Compare commits

No commits in common. "172729bdffc716b386a5fb603f814dc4b59f1b44" and "70df3018235ad6302ec0fea34e5f7d41a6bda748" have entirely different histories.

25 changed files with 66 additions and 2744 deletions

@@ -1,79 +0,0 @@
# cleanup-failed.yml — workflow_dispatch only.
#
# Tears down the kept-alive failed-deploy color (the inactive one
# that survived a Phase D / Phase F failure for forensics).
# Operator triggers this once they have read the journalctl output.
#
# Hard safety in playbooks/cleanup_failed.yml: refuses to destroy
# the currently-active color.
name: Veza cleanup failed-deploy color
on:
workflow_dispatch:
inputs:
env:
description: "Environment to clean up"
required: true
type: choice
options: [staging, prod]
color:
description: "Color to destroy (must NOT be the active one)"
required: true
type: choice
options: [blue, green]
concurrency:
group: cleanup-${{ inputs.env }}
cancel-in-progress: false
jobs:
cleanup:
name: Destroy ${{ inputs.color }} app containers in ${{ inputs.env }}
runs-on: [self-hosted, incus]
timeout-minutes: 10
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 1
- name: Install ansible
run: |
sudo apt-get update -qq
sudo apt-get install -y ansible
ansible-galaxy collection install community.general
- name: Write vault password
env:
VAULT_PW: ${{ secrets.ANSIBLE_VAULT_PASSWORD }}
run: |
printf '%s' "$VAULT_PW" > "$RUNNER_TEMP/vault-pass"
chmod 0400 "$RUNNER_TEMP/vault-pass"
echo "VAULT_PASS_FILE=$RUNNER_TEMP/vault-pass" >> "$GITHUB_ENV"
- name: Run cleanup_failed.yml
working-directory: infra/ansible
env:
ANSIBLE_LOG_PATH: ${{ runner.temp }}/ansible-cleanup-${{ inputs.env }}-${{ inputs.color }}.log
ANSIBLE_HOST_KEY_CHECKING: "False"
run: |
ansible-playbook \
-i inventory/${{ inputs.env }}.yml \
playbooks/cleanup_failed.yml \
--vault-password-file "$VAULT_PASS_FILE" \
-e veza_env=${{ inputs.env }} \
-e target_color=${{ inputs.color }}
- name: Upload Ansible log
if: always()
uses: actions/upload-artifact@v4
with:
name: ansible-cleanup-${{ inputs.env }}-${{ inputs.color }}
path: ${{ runner.temp }}/ansible-cleanup-*.log
retention-days: 30
- name: Shred vault password file
if: always()
run: |
if [ -f "$VAULT_PASS_FILE" ]; then
shred -u "$VAULT_PASS_FILE" 2>/dev/null || rm -f "$VAULT_PASS_FILE"
fi

@@ -1,358 +0,0 @@
# Veza deploy pipeline.
#
# Triggers (intentionally narrow — see SECURITY note below):
# push:main → env=staging, sha=$GITHUB_SHA
# push:tags ['v*'] → env=prod, sha=$GITHUB_SHA (tag's pointee)
# workflow_dispatch → operator-supplied env + sha
#
# SECURITY: this workflow runs on a self-hosted runner with access to
# the Incus unix socket (effectively root on the host). DO NOT add
# `pull_request` or any fork-influenced trigger here — an attacker-
# controlled fork would be able to `incus exec` arbitrarily. The
# narrow trigger list above is the security boundary.
#
# Sequence: build (3 jobs in parallel) → upload artifacts → deploy.
name: Veza deploy
on:
push:
branches: [main]
tags: ['v*']
workflow_dispatch:
inputs:
env:
description: "Environment to deploy"
required: true
default: staging
type: choice
options: [staging, prod]
release_sha:
description: "Full git SHA to deploy (defaults to current HEAD if empty)"
required: false
type: string
concurrency:
# Only one deploy per env at a time. Newer pushes cancel older
# in-flight builds for the same env (the user almost always wants
# the newer commit). workflow_dispatch runs key on the chosen env.
group: deploy-${{ github.event_name == 'workflow_dispatch' && inputs.env || (github.ref_type == 'tag' && 'prod' || 'staging') }}
cancel-in-progress: true
env:
# Where build artefacts land. Set in Forgejo repo Variables:
# FORGEJO_REGISTRY_URL = https://forgejo.veza.fr/api/packages/talas/generic
REGISTRY_URL: ${{ vars.FORGEJO_REGISTRY_URL }}
jobs:
# =================================================================
# Resolve env + sha from the trigger.
# =================================================================
resolve:
name: Resolve env + SHA
runs-on: ubuntu-latest
outputs:
env: ${{ steps.r.outputs.env }}
sha: ${{ steps.r.outputs.sha }}
steps:
- name: Resolve
id: r
run: |
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
ENV="${{ inputs.env }}"
SHA="${{ inputs.release_sha || github.sha }}"
elif [ "${{ github.ref_type }}" = "tag" ]; then
ENV="prod"
SHA="${{ github.sha }}"
else
ENV="staging"
SHA="${{ github.sha }}"
fi
if ! echo "$SHA" | grep -Eq '^[0-9a-f]{40}$'; then
echo "SHA '$SHA' is not a 40-char git SHA"
exit 1
fi
echo "env=$ENV" >> "$GITHUB_OUTPUT"
echo "sha=$SHA" >> "$GITHUB_OUTPUT"
echo "Resolved env=$ENV sha=$SHA"
# =================================================================
# Build backend (Go).
# =================================================================
build-backend:
name: Build backend
needs: resolve
runs-on: ubuntu-latest
timeout-minutes: 20
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 1
ref: ${{ needs.resolve.outputs.sha }}
- name: Set up Go
uses: actions/setup-go@v5
with:
go-version: "1.25"
cache: true
cache-dependency-path: veza-backend-api/go.sum
- name: Test
working-directory: veza-backend-api
env:
VEZA_SKIP_INTEGRATION: "1"
run: go test ./... -short -count=1 -timeout 300s
- name: Build veza-api (CGO=0, static)
working-directory: veza-backend-api
env:
CGO_ENABLED: "0"
GOOS: linux
GOARCH: amd64
run: |
go build -trimpath -ldflags "-s -w" \
-o ./bin/veza-api ./cmd/api/main.go
go build -trimpath -ldflags "-s -w" \
-o ./bin/migrate_tool ./cmd/migrate_tool/main.go
- name: Stage tarball contents
working-directory: veza-backend-api
run: |
STAGE="$RUNNER_TEMP/veza-backend"
mkdir -p "$STAGE/migrations"
cp ./bin/veza-api ./bin/migrate_tool "$STAGE/"
cp -r ./migrations/* "$STAGE/migrations/" || true
echo "${{ needs.resolve.outputs.sha }}" > "$STAGE/VERSION"
- name: Pack tarball
run: |
cd "$RUNNER_TEMP"
tar --use-compress-program=zstd -cf \
"veza-backend-${{ needs.resolve.outputs.sha }}.tar.zst" \
-C "$RUNNER_TEMP/veza-backend" .
- name: Push to Forgejo Package Registry
env:
TOKEN: ${{ secrets.FORGEJO_REGISTRY_TOKEN }}
run: |
set -e
TARBALL="veza-backend-${{ needs.resolve.outputs.sha }}.tar.zst"
URL="${REGISTRY_URL}/veza-backend/${{ needs.resolve.outputs.sha }}/${TARBALL}"
echo "PUT → $URL"
curl -sSL --fail-with-body -X PUT \
-H "Authorization: token ${TOKEN}" \
--upload-file "$RUNNER_TEMP/${TARBALL}" \
"${URL}"
# =================================================================
# Build stream (Rust).
# =================================================================
build-stream:
name: Build stream
needs: resolve
runs-on: ubuntu-latest
timeout-minutes: 30
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 1
ref: ${{ needs.resolve.outputs.sha }}
- name: Set up Rust toolchain
run: |
command -v rustup >/dev/null || \
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable
source "$HOME/.cargo/env"
rustup target add x86_64-unknown-linux-musl
echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"
sudo apt-get update -qq && sudo apt-get install -y musl-tools
- name: Cache cargo + target
uses: actions/cache@v4
with:
path: |
~/.cargo/registry
~/.cargo/git
veza-stream-server/target
key: deploy-${{ runner.os }}-cargo-${{ hashFiles('veza-stream-server/Cargo.lock') }}
restore-keys: |
deploy-${{ runner.os }}-cargo-
- name: Test
working-directory: veza-stream-server
run: cargo test --workspace
- name: Build stream_server (musl static)
working-directory: veza-stream-server
run: |
cargo build --release --locked \
--target x86_64-unknown-linux-musl
- name: Stage tarball contents
working-directory: veza-stream-server
run: |
STAGE="$RUNNER_TEMP/veza-stream"
mkdir -p "$STAGE"
cp ./target/x86_64-unknown-linux-musl/release/stream_server "$STAGE/"
echo "${{ needs.resolve.outputs.sha }}" > "$STAGE/VERSION"
- name: Pack tarball
run: |
cd "$RUNNER_TEMP"
tar --use-compress-program=zstd -cf \
"veza-stream-${{ needs.resolve.outputs.sha }}.tar.zst" \
-C "$RUNNER_TEMP/veza-stream" .
- name: Push to Forgejo Package Registry
env:
TOKEN: ${{ secrets.FORGEJO_REGISTRY_TOKEN }}
run: |
set -e
TARBALL="veza-stream-${{ needs.resolve.outputs.sha }}.tar.zst"
URL="${REGISTRY_URL}/veza-stream/${{ needs.resolve.outputs.sha }}/${TARBALL}"
echo "PUT → $URL"
curl -sSL --fail-with-body -X PUT \
-H "Authorization: token ${TOKEN}" \
--upload-file "$RUNNER_TEMP/${TARBALL}" \
"${URL}"
# =================================================================
# Build web (React/Vite).
# =================================================================
build-web:
name: Build web
needs: resolve
runs-on: ubuntu-latest
timeout-minutes: 20
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 1
ref: ${{ needs.resolve.outputs.sha }}
- name: Use Node.js
uses: actions/setup-node@v4
with:
node-version: "20"
cache: "npm"
cache-dependency-path: package-lock.json
- name: Install dependencies
run: npm ci
- name: Build design tokens
run: npm run build:tokens --workspace=@veza/design-system
- name: Build SPA
working-directory: apps/web
env:
VITE_API_URL: /api/v1
VITE_DOMAIN: ${{ needs.resolve.outputs.env == 'prod' && 'veza.fr' || 'staging.veza.fr' }}
VITE_RELEASE_SHA: ${{ needs.resolve.outputs.sha }}
run: npm run build
- name: Stage tarball contents
run: |
STAGE="$RUNNER_TEMP/veza-web"
mkdir -p "$STAGE"
cp -r apps/web/dist/* "$STAGE/"
echo "${{ needs.resolve.outputs.sha }}" > "$STAGE/VERSION"
- name: Pack tarball
run: |
cd "$RUNNER_TEMP"
tar --use-compress-program=zstd -cf \
"veza-web-${{ needs.resolve.outputs.sha }}.tar.zst" \
-C "$RUNNER_TEMP/veza-web" .
- name: Push to Forgejo Package Registry
env:
TOKEN: ${{ secrets.FORGEJO_REGISTRY_TOKEN }}
run: |
set -e
TARBALL="veza-web-${{ needs.resolve.outputs.sha }}.tar.zst"
URL="${REGISTRY_URL}/veza-web/${{ needs.resolve.outputs.sha }}/${TARBALL}"
echo "PUT → $URL"
curl -sSL --fail-with-body -X PUT \
-H "Authorization: token ${TOKEN}" \
--upload-file "$RUNNER_TEMP/${TARBALL}" \
"${URL}"
# =================================================================
# Deploy via Ansible. Runs on the self-hosted runner that has
# Incus socket access (label `incus`). Requires Forgejo secrets:
# ANSIBLE_VAULT_PASSWORD — unlocks group_vars/all/vault.yml
# FORGEJO_REGISTRY_TOKEN — same token the build jobs use,
# passed to ansible-playbook so
# the data containers can fetch
# the tarballs they were just sent.
# =================================================================
deploy:
name: Deploy via Ansible
needs: [resolve, build-backend, build-stream, build-web]
runs-on: [self-hosted, incus]
timeout-minutes: 30
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 1
ref: ${{ needs.resolve.outputs.sha }}
- name: Install ansible + community.general + community.postgresql + community.rabbitmq
run: |
sudo apt-get update -qq
sudo apt-get install -y ansible python3-psycopg2 python3-pip
ansible-galaxy collection install \
community.general \
community.postgresql \
community.rabbitmq
- name: Write vault password to a tmpfile
env:
VAULT_PW: ${{ secrets.ANSIBLE_VAULT_PASSWORD }}
run: |
printf '%s' "$VAULT_PW" > "$RUNNER_TEMP/vault-pass"
chmod 0400 "$RUNNER_TEMP/vault-pass"
echo "VAULT_PASS_FILE=$RUNNER_TEMP/vault-pass" >> "$GITHUB_ENV"
- name: Run deploy_data.yml (idempotent provisioning + ZFS snapshot)
working-directory: infra/ansible
env:
ANSIBLE_LOG_PATH: ${{ runner.temp }}/ansible-data-${{ needs.resolve.outputs.env }}-${{ needs.resolve.outputs.sha }}.log
ANSIBLE_HOST_KEY_CHECKING: "False"
run: |
ansible-playbook \
-i inventory/${{ needs.resolve.outputs.env }}.yml \
playbooks/deploy_data.yml \
--vault-password-file "$VAULT_PASS_FILE" \
-e veza_env=${{ needs.resolve.outputs.env }} \
-e veza_release_sha=${{ needs.resolve.outputs.sha }} \
-e vault_forgejo_registry_token=${{ secrets.FORGEJO_REGISTRY_TOKEN }}
- name: Run deploy_app.yml (blue/green)
working-directory: infra/ansible
env:
ANSIBLE_LOG_PATH: ${{ runner.temp }}/ansible-app-${{ needs.resolve.outputs.env }}-${{ needs.resolve.outputs.sha }}.log
ANSIBLE_HOST_KEY_CHECKING: "False"
run: |
ansible-playbook \
-i inventory/${{ needs.resolve.outputs.env }}.yml \
playbooks/deploy_app.yml \
--vault-password-file "$VAULT_PASS_FILE" \
-e veza_env=${{ needs.resolve.outputs.env }} \
-e veza_release_sha=${{ needs.resolve.outputs.sha }} \
-e vault_forgejo_registry_token=${{ secrets.FORGEJO_REGISTRY_TOKEN }}
- name: Upload Ansible logs (for forensics)
if: always()
uses: actions/upload-artifact@v4
with:
name: ansible-logs-${{ needs.resolve.outputs.env }}-${{ needs.resolve.outputs.sha }}
path: ${{ runner.temp }}/ansible-*.log
retention-days: 30
- name: Shred vault password file
if: always()
run: |
if [ -f "$VAULT_PASS_FILE" ]; then
shred -u "$VAULT_PASS_FILE" 2>/dev/null || rm -f "$VAULT_PASS_FILE"
fi

@@ -1,118 +0,0 @@
# rollback.yml — workflow_dispatch only.
#
# Two modes:
# fast — flip HAProxy back to the previous color. ~5s. Requires
# the target color's containers to still be alive
# (i.e., no later deploy has recycled them).
# full — re-run deploy_app.yml with a specific (older) release_sha.
# ~5-10min. The artefact must still be in the Forgejo
# registry (default retention 30 SHA per component).
#
# See docs/RUNBOOK_ROLLBACK.md for decision criteria.
name: Veza rollback
on:
workflow_dispatch:
inputs:
env:
description: "Environment to rollback"
required: true
type: choice
options: [staging, prod]
mode:
description: "Rollback mode"
required: true
type: choice
options: [fast, full]
target_color:
description: "(mode=fast only) color to flip back TO (the prior active one)"
required: false
type: choice
options: [blue, green]
release_sha:
description: "(mode=full only) 40-char SHA of the release to redeploy"
required: false
type: string
concurrency:
group: rollback-${{ inputs.env }}
cancel-in-progress: false
jobs:
rollback:
name: Rollback ${{ inputs.env }} (${{ inputs.mode }})
runs-on: [self-hosted, incus]
timeout-minutes: 30
steps:
- name: Validate inputs
run: |
if [ "${{ inputs.mode }}" = "fast" ] && [ -z "${{ inputs.target_color }}" ]; then
echo "mode=fast requires target_color"
exit 1
fi
if [ "${{ inputs.mode }}" = "full" ]; then
if [ -z "${{ inputs.release_sha }}" ]; then
echo "mode=full requires release_sha"
exit 1
fi
if ! echo "${{ inputs.release_sha }}" | grep -Eq '^[0-9a-f]{40}$'; then
echo "release_sha is not a 40-char git SHA"
exit 1
fi
fi
- uses: actions/checkout@v4
with:
fetch-depth: 1
ref: ${{ inputs.mode == 'full' && inputs.release_sha || github.ref }}
- name: Install ansible + collections
run: |
sudo apt-get update -qq
sudo apt-get install -y ansible python3-psycopg2
ansible-galaxy collection install \
community.general \
community.postgresql \
community.rabbitmq
- name: Write vault password
env:
VAULT_PW: ${{ secrets.ANSIBLE_VAULT_PASSWORD }}
run: |
printf '%s' "$VAULT_PW" > "$RUNNER_TEMP/vault-pass"
chmod 0400 "$RUNNER_TEMP/vault-pass"
echo "VAULT_PASS_FILE=$RUNNER_TEMP/vault-pass" >> "$GITHUB_ENV"
- name: Run rollback.yml
working-directory: infra/ansible
env:
ANSIBLE_LOG_PATH: ${{ runner.temp }}/ansible-rollback-${{ inputs.env }}-${{ inputs.mode }}.log
ANSIBLE_HOST_KEY_CHECKING: "False"
run: |
EXTRA="-e veza_env=${{ inputs.env }} -e mode=${{ inputs.mode }}"
if [ "${{ inputs.mode }}" = "fast" ]; then
EXTRA="$EXTRA -e target_color=${{ inputs.target_color }}"
else
EXTRA="$EXTRA -e veza_release_sha=${{ inputs.release_sha }}"
EXTRA="$EXTRA -e vault_forgejo_registry_token=${{ secrets.FORGEJO_REGISTRY_TOKEN }}"
fi
ansible-playbook \
-i inventory/${{ inputs.env }}.yml \
playbooks/rollback.yml \
--vault-password-file "$VAULT_PASS_FILE" \
$EXTRA
- name: Upload Ansible log
if: always()
uses: actions/upload-artifact@v4
with:
name: ansible-rollback-${{ inputs.env }}-${{ inputs.mode }}
path: ${{ runner.temp }}/ansible-rollback-*.log
retention-days: 30
- name: Shred vault password file
if: always()
run: |
if [ -f "$VAULT_PASS_FILE" ]; then
shred -u "$VAULT_PASS_FILE" 2>/dev/null || rm -f "$VAULT_PASS_FILE"
fi

.gitignore

@@ -265,14 +0,3 @@ frontend_screenshots/
# Audit_remediation glob (supersedes J2's exact-match json)
apps/web/audit_remediation*
# ============================================================
# Ansible Vault — secrets at rest stay encrypted in vault.yml
# (committed). The vault password used to unlock them MUST NOT
# be committed; the Forgejo runner reads it from a repo secret.
# ============================================================
infra/ansible/.vault-pass
infra/ansible/.vault-pass.*
# Local copies devs sometimes drop next to the repo for editing
.vault-pass
.vault-pass.*

@@ -1,111 +0,0 @@
# Canary release — backend-api
> **Audience**: on-call engineer running a release.
> **Trigger**: a new backend-api binary signed off for prod.
> **Owner**: whoever's on the deploy rota that day.
The canary recipe ships the new binary to **one** backend at a time, watches the SLI for a window, and only continues to the next backend when the SLI stays green. If the SLI breaches at any point, the canary node rolls back automatically to the last-known-good binary.
## Trigger conditions
Run the canary script when one of these is true:
- A normal feature release. New code path, no schema migration that requires lockstep coordination.
- A hot-fix on a Sev-2 or below issue. Sev-1 (security or data-integrity) follows the all-stop rotate path documented in `docs/runbooks/INCIDENT_RESPONSE.md` instead.
## Pre-flight checklist
- [ ] **Migration backward-compat**: the latest schema migration is additive only — no `DROP COLUMN`, no `ALTER COLUMN ... TYPE`, no `ADD COLUMN ... NOT NULL` without `DEFAULT`. The script's pre-deploy hook (`scripts/check-migration-backward-compat.sh`) refuses to proceed when it finds one; bypass with `FORCE_MIGRATE=1` only after you've worked out how the migration would split into backward-compatible steps.
- [ ] **Last-known-good binary** is preserved. Either: (a) the previous release's `veza-api` is still on the host at `/opt/veza/backend-api/veza-api.previous`, OR (b) you have it locally and pass `ROLLBACK_BINARY=/path/to/old/veza-api` as env to the script.
- [ ] **Prometheus reachable** from the deploy host. The SLI monitor queries `${PROM_URL}` (default `http://prom.lxd:9090`) every `${SLI_PROBE_INTERVAL}` seconds for 1 hour.
- [ ] **HAProxy admin socket reachable**: the script execs into the haproxy Incus container to drive `set server ${POOL}/${NODE} state drain|ready` via socat (a minimal drain/ready sketch follows this checklist).
- [ ] **No game day in the same window.** Canary needs a quiet baseline; chaos drills will push the SLI red and trigger a false rollback.
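A minimal sketch of that drain/ready flip, hedged on an assumed socket path (`/run/haproxy/admin.sock`), an assumed container name (`haproxy` — substitute the real prefixed name), and the default pool/node names from the knobs table below; the script header stays the source of truth:
```bash
POOL=api_pool                   # POOL_BACKEND default
NODE=backend-api-2              # CANARY_NODE default
SOCK=/run/haproxy/admin.sock    # assumed admin socket path

# Send one admin-socket command to HAProxy from the deploy host.
hap() { incus exec haproxy -- sh -c "echo '$1' | socat stdio $SOCK"; }

hap "set server $POOL/$NODE state drain"

# 'show stat' CSV: column 1 = backend, 2 = server, 5 = current sessions (scur).
while :; do
  scur=$(hap 'show stat' | awk -F, -v p="$POOL" -v n="$NODE" '$1==p && $2==n {print $5}')
  [ "${scur:-0}" = "0" ] && break
  sleep 2
done

# ...push the binary and run the per-node health check, then put the node back:
hap "set server $POOL/$NODE state ready"
```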
## How
### One-shot via Make
```bash
make deploy-canary ARTIFACT=/tmp/veza-api-v1.0.10
```
The Make target wraps the script with reasonable defaults. Override any env (see the script header) by exporting before the `make` call.
### Direct script invocation
```bash
ARTIFACT=/tmp/veza-api-v1.0.10 \
ROLLBACK_BINARY=/opt/veza/backend-api/veza-api.previous \
SLI_WINDOW=3600 \
PROM_URL=http://prom.lxd:9090 \
bash scripts/deploy-canary.sh
```
The script is idempotent on the steps that matter: draining an already-drained server is a no-op; pushing the same binary twice is a no-op (file mtime invariant). Re-runs after a partial failure are safe.
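A sketch of what the push-side idempotence can look like — here with a checksum comparison rather than the mtime check the script uses, with paths and names taken from the defaults quoted in this runbook:
```bash
ARTIFACT=${ARTIFACT:?path to the new veza-api binary}
CT=${CANARY_NODE:-backend-api-2}
DEST=/opt/veza/backend-api/veza-api

new_sum=$(sha256sum "$ARTIFACT" | cut -d' ' -f1)
cur_sum=$(incus exec "$CT" -- sha256sum "$DEST" 2>/dev/null | cut -d' ' -f1 || true)

if [ "$new_sum" = "$cur_sum" ]; then
  echo "binary already deployed on $CT — skipping push"
else
  incus file push "$ARTIFACT" "$CT$DEST" --mode 0755
  incus exec "$CT" -- systemctl restart veza-backend-api
fi
```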
## What happens, in order
1. **Pre-deploy hook** runs `scripts/check-migration-backward-compat.sh` on the new-since-`origin/main` migration files. Forbidden patterns abort the deploy.
2. **Drain `CANARY_NODE`** (default `backend-api-2`) via the HAProxy admin socket. Wait until the node has 0 active connections.
3. **Push the binary** to `/opt/veza/backend-api/veza-api` on the canary container. `systemctl restart veza-backend-api`.
4. **Per-node health check**: `curl http://127.0.0.1:8080/api/v1/health` from inside the container. If the node doesn't return 200 within 60 s, rollback.
5. **Re-enable** the canary node in HAProxy.
6. **LB-side health check**: `curl http://haproxy.lxd${HEALTH_PATH}` returns 200 (proves HAProxy sees the node ready and routes through it).
7. **SLI monitor** for `SLI_WINDOW` seconds (default 3600 = 1h). Probes Prometheus every `SLI_PROBE_INTERVAL` (default 30 s) for:
- p95 of `veza_gin_http_request_duration_seconds_bucket` < `PROM_P95_THRESHOLD_S` (0.5 s)
- error rate (5xx ÷ total) < `PROM_ERR_RATE_THRESHOLD` (0.005 = 0.5%)
First red probe → rollback (a probe-loop sketch follows this list).
8. **Roll the peers**: for each `PEER_NODES` entry (default `backend-api-1`), repeat steps 2–6 (drain → deploy → health → re-enable → LB check). The peer roll skips the SLI monitor because the canary already proved the SLI; if a peer-specific failure happens (binary corrupt on push, container disk full), the script bails out.
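A sketch of the step-7 probe loop under the thresholds above. The latency histogram name comes from this runbook; the request-count series used for the error rate is an assumption, and `jq` is expected on the deploy host:
```bash
PROM_URL=${PROM_URL:-http://prom.lxd:9090}
SLI_WINDOW=${SLI_WINDOW:-3600}
SLI_PROBE_INTERVAL=${SLI_PROBE_INTERVAL:-30}
PROM_P95_THRESHOLD_S=${PROM_P95_THRESHOLD_S:-0.5}
PROM_ERR_RATE_THRESHOLD=${PROM_ERR_RATE_THRESHOLD:-0.005}

# Instant query against Prometheus; prints the scalar value or nothing.
prom() {
  curl -fsS "$PROM_URL/api/v1/query" --data-urlencode "query=$1" \
    | jq -r '.data.result[0].value[1] // empty'
}
gt() { awk -v a="${1:-0}" -v b="$2" 'BEGIN { exit !(a > b) }'; }   # numeric a > b

deadline=$(( $(date +%s) + SLI_WINDOW ))
while [ "$(date +%s)" -lt "$deadline" ]; do
  p95=$(prom "histogram_quantile(0.95, sum by (le) (rate(veza_gin_http_request_duration_seconds_bucket[5m])))")
  # NOTE: the series below is assumed; only the histogram is named in this runbook.
  err=$(prom "sum(rate(veza_gin_http_requests_total{status=~\"5..\"}[5m])) / sum(rate(veza_gin_http_requests_total[5m]))")
  if gt "$p95" "$PROM_P95_THRESHOLD_S" || gt "$err" "$PROM_ERR_RATE_THRESHOLD"; then
    echo "SLI breach (p95=${p95:-n/a}s err=${err:-n/a}) — rolling back" >&2
    exit 1   # the real script triggers the auto-rollback here
  fi
  sleep "$SLI_PROBE_INTERVAL"
done
echo "SLI stayed green for ${SLI_WINDOW}s"
```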
## Rollback path
The script handles the canary rollback automatically when:
- The pre-deploy hook fails. Nothing is changed; nothing to revert.
- The canary's health check fails after the deploy. Old binary restored from `ROLLBACK_BINARY`, canary re-enabled.
- The SLI breaches during the monitor window. Same as above.
The script does **NOT** roll back peers automatically — by the time peers are rolling, the canary has already accumulated a green-SLI window. A peer health failure is an artifact of the deploy step (corrupt push, container memory issue), not of the new binary itself, and re-running after fixing the local issue is safer than ping-ponging the binary.
## Manual rollback (full)
When the script doesn't catch the regression — say a slow leak that surfaces after the SLI window closes — the on-call manually drives:
```bash
# Find which backend is on the new binary:
incus exec backend-api-1 -- ls -la /opt/veza/backend-api/veza-api
incus exec backend-api-2 -- ls -la /opt/veza/backend-api/veza-api
# Rotate both back to the previous binary:
for ct in backend-api-1 backend-api-2; do
incus exec "$ct" -- mv /opt/veza/backend-api/veza-api.previous /opt/veza/backend-api/veza-api
incus exec "$ct" -- systemctl restart veza-backend-api
done
```
The previous binary is conventionally kept at `${INSTALL_DIR}/veza-api.previous`; the canary script does NOT copy the current binary there before overwriting (deliberate — that's a deploy-pipeline responsibility, not a per-canary responsibility).
## Configuration knobs
All of these are env vars — the script header is the source of truth for defaults. A worked override example follows the table.
| Knob | Default | When to change |
| ----------------------------- | ----------------------------- | ----------------------------------------------------- |
| `POOL_BACKEND` | `api_pool` | If you renamed the HAProxy backend |
| `CANARY_NODE` | `backend-api-2` | Toggle which node receives the canary first |
| `PEER_NODES` | `backend-api-1` | When the fleet grows beyond 2 nodes |
| `SLI_WINDOW` | `3600` (1 h) | Shorten for hot-fixes (300 = 5 min minimum) |
| `SLI_PROBE_INTERVAL` | `30` s | Tighter probes catch a leak faster but cost Prom load |
| `PROM_P95_THRESHOLD_S`        | `0.5`                         | Match the SLO; loosening it hides regressions          |
| `PROM_ERR_RATE_THRESHOLD` | `0.005` (0.5 %) | Match the SLO |
| `ROLLBACK_BINARY` | (unset) | Always set in a real run — auto-rollback can't work without it |
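For instance, a hot-fix canary with a shortened window might look like this (the version in the artifact name is illustrative, and this relies on make passing its environment down to the script, per the note under the Make target above):
```bash
SLI_WINDOW=300 SLI_PROBE_INTERVAL=10 \
  make deploy-canary ARTIFACT=/tmp/veza-api-v1.0.11
```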
## Acceptance bar (Day 23)
Per `docs/ROADMAP_V1.0_LAUNCH.md`: 3 canary deploys on staging, 2 normal + 1 with a deliberate rollback (e.g. push a binary that hardcodes a 500 on `/api/v1/health`). The rollback exercise verifies the script's auto-revert path actually fires.
## What this doesn't do
- **Cross-LB rolls**: single haproxy assumed. When phase-2 adds keepalived + a second LB, the canary script will need a `--lb-set` arg to roll the LB pair too.
- **Database migrations**: split-read-write migrations (e.g. dual-write during a rename) need a multi-step deploy that this script doesn't model. For now, only additive migrations are supported through the canary.
- **Stream-server canary**: the Rust streamer follows a separate playbook (URI-hash routing means a per-track-id affinity, not a per-session affinity). Same principles apply but the script is backend-api-specific.

@@ -1,67 -0,0 @@
# `group_vars/` layout
Three layers, in order of precedence (later wins — a check of the merged result follows the list):
1. `all/main.yml` — defaults shared across every inventory. Cross-cutting
values like SSH hardening, monitoring agent version, and the Veza
deploy contract (artifact URL, base image, ports, health probes).
2. `<env>.yml` — environment overrides. Today: `staging.yml`, `prod.yml`
(and `lab.yml` would live here too if `inventory/lab.yml` ever
referenced an `all/lab` group). Typical overrides pin the Incus host,
container prefix, public domain, log level, and feature flags.
3. `all/vault.yml` — encrypted secrets (Ansible Vault). All entries
prefixed `vault_*`. Plaintext template at `all/vault.yml.example`.
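A quick way to see what those layers resolve to for one host — the host name here is taken from the naming example in `group_vars/staging.yml` and `jq` is assumed; substitute a real inventory host:
```bash
cd infra/ansible
# --playbook-dir lets ansible-inventory pick up the playbook-adjacent group_vars/
# (not just inventory/group_vars/); the vault password file decrypts vault_* values.
ansible-inventory -i inventory/staging.yml --playbook-dir . \
  --vault-password-file .vault-pass \
  --host veza-staging-backend-blue | jq '{veza_log_level, veza_otel_sample_rate}'
```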
## Bootstrapping the vault
The vault file is **not** committed at first. To stand it up:
```bash
cd infra/ansible
cp group_vars/all/vault.yml.example group_vars/all/vault.yml
$EDITOR group_vars/all/vault.yml # fill in <TODO> placeholders
ansible-vault encrypt group_vars/all/vault.yml
echo "<your strong vault password>" > .vault-pass
chmod 0400 .vault-pass
```
`.vault-pass` is gitignored — never commit it. The Forgejo runner
gets the same password from the `ANSIBLE_VAULT_PASSWORD` repo secret
(see `.forgejo/workflows/deploy.yml`).
To edit later without decrypting on disk:
```bash
ansible-vault edit group_vars/all/vault.yml
```
To rotate the password (e.g., when an operator leaves):
```bash
ansible-vault rekey group_vars/all/vault.yml
echo "<new password>" > .vault-pass
# update Forgejo secret ANSIBLE_VAULT_PASSWORD to the new value
```
## How variables flow into containers
```
[Ansible runtime] [Container]
group_vars/all/main.yml ┐
group_vars/<env>.yml ├──→ roles/veza_app/templates/*.j2 ──→ /etc/veza/<component>.env
group_vars/all/vault.yml ┘ ──→ /etc/veza/secrets/jwt-private.pem
──→ systemd unit (EnvironmentFile=)
```
The systemd unit then reads `/etc/veza/<component>.env` at start time.
Reload semantics: a config change re-templates the env file and
notifies the systemd handler, which restarts the unit.
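A spot-check of that flow on a running container — the container, unit, and env-file names here follow the staging/backend examples used elsewhere in this repo and may differ from what the role actually installs:
```bash
# Which env file(s) the unit loads:
incus exec veza-staging-backend-blue -- systemctl cat veza-backend-api | grep -i EnvironmentFile
# What the template rendered:
incus exec veza-staging-backend-blue -- cat /etc/veza/backend.env
# Did the handler's restart actually pick it up?
incus exec veza-staging-backend-blue -- systemctl show -p ActiveState,ExecMainStartTimestamp veza-backend-api
```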
## What lives in `host_vars/`?
`host_vars/<host>.yml` for **per-host** overrides — typically when one
container in an HA group needs a slightly different config (e.g., the
postgres-primary needs `pg_auto_failover_role: node`, the monitor
needs `pg_auto_failover_role: monitor`). The lab inventory inlines
these as host-level vars; `host_vars/` exists for cases where they
shouldn't bloat the inventory file.

@@ -0,0 +1,40 @@
# Shared defaults across every inventory (lab/staging/prod). Override
# per-environment in `group_vars/<group>.yml` or per-host in
# `host_vars/<host>.yml`.
---
# Owner contact (used in some unattended-upgrades + monitoring agent configs).
veza_ops_email: ops@veza.fr
# v1.0.9 Day 5: SSH hardening surface that the `common` role enforces.
# Override these in production via group_vars/veza_prod.yml when the
# bastion's specific port / allowed users are decided. Defaults are
# safe for lab.
ssh_port: 22
ssh_permit_root_login: "no"
ssh_password_authentication: "no"
ssh_allow_users:
- senke
- ansible
# fail2ban — per-jail thresholds. The defaults are conservative for
# a self-hosted single-machine deployment; production may want
# lower findtime / higher bantime once Forgejo + Veza traffic is
# baselined.
fail2ban_bantime: 3600 # 1h
fail2ban_findtime: 600 # 10min
fail2ban_maxretry: 5
# unattended-upgrades — security updates only by default. The role
# never enables auto-reboot; ROADMAP_V1.0_LAUNCH.md §5 game day pins
# downtime windows to controlled cycles, not OS-driven reboots.
unattended_upgrades_origins:
- "${distro_id}:${distro_codename}-security"
- "${distro_id}ESMApps:${distro_codename}-apps-security"
- "${distro_id}ESM:${distro_codename}-infra-security"
unattended_upgrades_auto_reboot: false
# Monitoring agent: prometheus node_exporter is the bare-minimum
# host metrics surface (CPU / memory / disk / network). The
# observability stack (Tempo + Loki + Grafana) lands W2 in roadmap.
monitoring_node_exporter_version: "1.8.2"
monitoring_node_exporter_port: 9100

@@ -1,90 +0,0 @@
# Shared defaults across every inventory (lab/staging/prod). Override
# per-environment in `group_vars/<group>.yml` or per-host in
# `host_vars/<host>.yml`.
---
# Owner contact (used in some unattended-upgrades + monitoring agent configs).
veza_ops_email: ops@veza.fr
# v1.0.9 Day 5: SSH hardening surface that the `common` role enforces.
# Override these in production via group_vars/veza_prod.yml when the
# bastion's specific port / allowed users are decided. Defaults are
# safe for lab.
ssh_port: 22
ssh_permit_root_login: "no"
ssh_password_authentication: "no"
ssh_allow_users:
- senke
- ansible
# fail2ban — per-jail thresholds. The defaults are conservative for
# a self-hosted single-machine deployment; production may want
# lower findtime / higher bantime once Forgejo + Veza traffic is
# baselined.
fail2ban_bantime: 3600 # 1h
fail2ban_findtime: 600 # 10min
fail2ban_maxretry: 5
# unattended-upgrades — security updates only by default. The role
# never enables auto-reboot; ROADMAP_V1.0_LAUNCH.md §5 game day pins
# downtime windows to controlled cycles, not OS-driven reboots.
unattended_upgrades_origins:
- "${distro_id}:${distro_codename}-security"
- "${distro_id}ESMApps:${distro_codename}-apps-security"
- "${distro_id}ESM:${distro_codename}-infra-security"
unattended_upgrades_auto_reboot: false
# Monitoring agent: prometheus node_exporter is the bare-minimum
# host metrics surface (CPU / memory / disk / network). The
# observability stack (Tempo + Loki + Grafana) lands W2 in roadmap.
monitoring_node_exporter_version: "1.8.2"
monitoring_node_exporter_port: 9100
# ============================================================
# Veza app deploy — defaults shared by every environment.
# Each can be overridden in group_vars/{staging,prod}.yml.
# ============================================================
# Forgejo Package Registry where the deploy workflow pushes release
# tarballs. Forgejo's generic-package URL shape is:
# {base}/{owner}/generic/{package}/{version}/{filename}
# We treat each component as a separate package (`veza-backend`,
# `veza-stream`, `veza-web`), the SHA as the version, and the
# tarball name as the filename. Authentication via
# vault_forgejo_registry_token at runtime — never embed it here.
veza_artifact_base_url: "https://forgejo.veza.fr/api/packages/talas/generic"
# Container image used as the base for fresh app containers. The
# `veza_app` role apt-installs OS deps on top. Pinned tag keeps deploys
# reproducible across base-image updates.
veza_app_base_image: "images:debian/13"
# Per-component HTTP ports. Backend listens on `APP_PORT` env var;
# stream listens on `PORT` env var. Templates render these into env
# files; HAProxy reads them to wire backends.
veza_backend_port: 8080
veza_stream_port: 8082
veza_web_port: 80
# Health probe parameters — used by deploy_app's Phase D and by the
# rollback playbook when verifying a switched color.
veza_healthcheck_retries: 30
veza_healthcheck_delay_seconds: 2
veza_healthcheck_paths:
backend: /api/v1/health
stream: /health
web: /
# OS package set installed in every fresh app container. Component-
# specific extras live in roles/veza_app/vars/<component>.yml.
veza_common_os_packages:
- ca-certificates
- curl
- tzdata
- zstd # to decompress release tarballs
# Where artefacts land in-container. Per-SHA subdirs let multiple
# releases coexist for forensics without conflict.
veza_install_root: /opt/veza
veza_config_root: /etc/veza
veza_log_root: /var/log/veza
veza_state_root: /var/lib/veza

@@ -1,78 +0,0 @@
# Template for group_vars/all/vault.yml — the encrypted secrets store
# consumed by every playbook. Copy this file to vault.yml, fill in real
# values, then encrypt:
#
# cp vault.yml.example vault.yml
# $EDITOR vault.yml # fill in real values
# ansible-vault encrypt vault.yml # in place
# echo "<your strong password>" > ../../../.vault-pass # gitignored
# chmod 0400 ../../../.vault-pass
#
# After that, every `ansible-playbook` invocation needs:
# ansible-playbook --vault-password-file infra/ansible/.vault-pass ...
# The Forgejo deploy workflow handles this via the ANSIBLE_VAULT_PASSWORD
# repo secret (see .forgejo/workflows/deploy.yml).
#
# Naming: every secret is prefixed `vault_*` so it's grep-able and so
# `group_vars/all/main.yml` references like `postgres_password:
# "{{ vault_postgres_password }}"` are unambiguous.
---
# --- Database -----------------------------------------------------------
vault_postgres_password: "<TODO: 32+ char strong password for veza role>"
vault_postgres_replication_password: "<TODO: separate password for replication user>"
# --- Cache / queue ------------------------------------------------------
vault_redis_password: "<TODO>"
vault_rabbitmq_password: "<TODO>"
# --- Object storage (MinIO) ---------------------------------------------
vault_minio_root_user: "<TODO: only used to bootstrap the cluster>"
vault_minio_root_password: "<TODO: 16+ chars, MinIO refuses shorter>"
vault_minio_access_key: "<TODO: app-tier access key>"
vault_minio_secret_key: "<TODO: app-tier secret key>"
# --- JWT ----------------------------------------------------------------
# Backend prefers RS256 in prod. Generate with:
# openssl genrsa -out jwt-private.pem 4096
# openssl rsa -in jwt-private.pem -pubout -out jwt-public.pem
# Then base64 each:
# base64 -w0 jwt-private.pem
# base64 -w0 jwt-public.pem
vault_jwt_signing_key_b64: "<TODO: base64 of RS256 private PEM>"
vault_jwt_public_key_b64: "<TODO: base64 of RS256 public PEM>"
# Chat WebSocket signs its own short-lived tokens — must differ from the
# main JWT secret in production (defense in depth).
vault_chat_jwt_secret: "<TODO: 32+ chars, distinct from JWT signing key>"
# --- App-internal API keys ---------------------------------------------
# Backend ↔ stream-server shared secret. Both services must have the
# same value so /api/v1/internal/* requests authenticate.
vault_stream_internal_api_key: "<TODO: 32+ chars>"
# OAuth refresh tokens are encrypted at rest with this key.
vault_oauth_encryption_key: "<TODO: exactly 32 bytes, raw or hex>"
# --- Email --------------------------------------------------------------
vault_smtp_password: "<TODO>"
# --- Payments -----------------------------------------------------------
# Hyperswitch routes through Stripe Connect. Both keys are required if
# `HYPERSWITCH_ENABLED=true` in group_vars/<env>.yml.
vault_hyperswitch_api_key: "<TODO>"
vault_hyperswitch_webhook_secret: "<TODO>"
vault_stripe_secret_key: "<TODO: sk_live_… in prod, sk_test_… in staging>"
# --- OAuth providers ----------------------------------------------------
# Add only the providers you actually enable; keys consumed by
# templates/backend.env.j2 conditionally on truthiness.
vault_oauth_clients:
google:
id: "<TODO>"
secret: "<TODO>"
spotify:
id: "<TODO>"
secret: "<TODO>"
# --- Sentry / observability --------------------------------------------
vault_sentry_dsn: "<TODO: empty string disables Sentry>"

@@ -1,42 +0,0 @@
# Prod-specific overrides. Same R720 host as staging in v1.0; separate
# Incus network + container prefix prevents staging/prod from sharing
# any state. Phase-2 (post v1.1) is expected to move prod to a
# dedicated host, at which point only `veza_incus_host` flips.
---
veza_env: prod
veza_release_channel: prod
veza_incus_host: veza-prod
veza_incus_network: veza-net
veza_incus_subnet: 10.0.20.0/24
veza_container_prefix: "veza-" # production keeps the bare veza- prefix (no env infix) — the established convention
veza_incus_dns_suffix: lxd
haproxy_topology: blue-green
veza_public_host: veza.fr
veza_public_url: "https://veza.fr"
veza_cors_allowed_origins:
- "https://veza.fr"
- "https://app.veza.fr"
# Prod is INFO so 99th-percentile log volume stays manageable. Bump to
# DEBUG for a window via `ansible-playbook -e veza_log_level=DEBUG` if
# triaging an incident.
veza_log_level: INFO
veza_otel_sample_rate: "0.05"
veza_feature_flags:
HYPERSWITCH_ENABLED: "true"
STRIPE_CONNECT_ENABLED: "true"
WEBAUTHN_ENABLED: "true"
# Larger retention than staging — prod rollback may need to reach a
# release from up to a month ago when the cause was latent.
veza_release_retention: 60
postgres_password: "{{ vault_postgres_password }}"
redis_password: "{{ vault_redis_password }}"
rabbitmq_password: "{{ vault_rabbitmq_password }}"

@@ -1,67 +0,0 @@
# Staging-specific overrides. Targets the local R720 Incus daemon (the
# same host the Forgejo runner lives on). Containers prefixed `veza-staging-*`
# share the `veza-staging-net` Incus bridge (10.0.21.0/24).
#
# Phase-1 simplification: staging and prod coexist on the same R720 but
# on separate Incus networks (veza-staging-net 10.0.21.0/24 vs
# veza-prod-net 10.0.20.0/24) and separate container name prefixes
# (veza-staging-* vs veza-prod-*). When prod migrates off-box (Hetzner
# or similar), this file's `veza_incus_host` flips to that target.
---
veza_env: staging
veza_release_channel: staging
# Where the Incus daemon lives. Used by the deploy workflow to decide
# which inventory host's `community.general.incus` connection plugin
# to drive containers from.
veza_incus_host: veza-staging
veza_incus_network: veza-staging-net
veza_incus_subnet: 10.0.21.0/24
# Container name prefix — every app/data container ends up named
# `<veza_container_prefix><component>[-<color>]`. e.g.
# veza-staging-backend-blue, veza-staging-postgres.
veza_container_prefix: "veza-staging-"
# DNS suffix Incus assigns to managed containers. The HAProxy template
# resolves backends as `<container>.<suffix>`. Default `.lxd` works
# with the stock Incus DNS resolver; override if you've renamed the
# managed network's DNS zone.
veza_incus_dns_suffix: lxd
# HAProxy strategy for the staging stack: blue/green, two app
# containers per component (active + standby). Differs from the lab
# inventory which uses an active/active multi-instance pattern.
haproxy_topology: blue-green
# Public-facing URLs — used by backend for OAuth redirects, email
# links, CSP origins, and by HAProxy ACLs.
veza_public_host: staging.veza.fr
veza_public_url: "https://staging.veza.fr"
veza_cors_allowed_origins:
- "https://staging.veza.fr"
- "https://staging-app.veza.fr"
# Logging — staging keeps DEBUG to make incident triage easy. Prod
# drops to INFO. Tracing sample rate stays at 100% in staging
# (low traffic) and 5% in prod (cost).
veza_log_level: DEBUG
veza_otel_sample_rate: "1.0"
# Feature flags exposed to the backend at boot. Keep this list small —
# the backend's own .env.template is the canonical reference.
veza_feature_flags:
HYPERSWITCH_ENABLED: "false"
STRIPE_CONNECT_ENABLED: "false"
WEBAUTHN_ENABLED: "true"
# How many recent release SHAs the rollback workflow can target. Older
# tarballs are pruned by the Forgejo registry retention policy (set
# externally). 30 deploys ≈ a working week given the staging cadence.
veza_release_retention: 30
# Postgres password the migrations job uses — references vault.yml so
# rotation is one ansible-vault edit + one redeploy.
postgres_password: "{{ vault_postgres_password }}"
redis_password: "{{ vault_redis_password }}"
rabbitmq_password: "{{ vault_rabbitmq_password }}"

@@ -1,83 +0,0 @@
# cleanup_failed.yml — destroy the app containers of a specific color.
# Used when a deploy_app.yml run failed Phase D or Phase F and the
# operator has finished forensics on the kept-alive failed color.
#
# Required extra-vars:
# veza_env staging | prod
# target_color blue | green (the color to tear down)
#
# Safety: refuses to destroy the CURRENTLY-ACTIVE color. Active color
# is read from the HAProxy container's /var/lib/veza/active-color.
#
# Caller (workflow_dispatch only):
# ansible-playbook -i inventory/{{env}}.yml playbooks/cleanup_failed.yml \
# -e veza_env={{env}} -e target_color={{color}}
---
- name: Validate inputs and refuse to nuke the active color
hosts: incus_hosts
become: true
gather_facts: false
tasks:
- name: Assert required vars
ansible.builtin.assert:
that:
- veza_env is defined
- veza_env in ['staging', 'prod']
- target_color is defined
- target_color in ['blue', 'green']
fail_msg: cleanup_failed.yml requires veza_env + target_color.
quiet: true
- name: Read active color from HAProxy container
ansible.builtin.shell: |
incus exec "{{ veza_container_prefix }}haproxy" -- \
cat /var/lib/veza/active-color 2>/dev/null | tr -d '[:space:]'
args:
executable: /bin/bash
register: active_color_raw
changed_when: false
failed_when: false
- name: Resolve current_active_color
ansible.builtin.set_fact:
current_active_color: "{{ active_color_raw.stdout if active_color_raw.stdout else 'blue' }}"
- name: Refuse if target_color matches the active color
ansible.builtin.fail:
msg: >-
target_color={{ target_color }} matches the currently-active
color in HAProxy. Refusing to destroy live containers.
Switch HAProxy first via rollback.yml or a re-deploy.
when: target_color == current_active_color
- name: Destroy the inactive-color app containers
hosts: incus_hosts
become: true
gather_facts: false
tasks:
- name: Force-delete each component container
ansible.builtin.shell: |
set -e
CT="{{ veza_container_prefix }}{{ item }}-{{ target_color }}"
if incus info "$CT" >/dev/null 2>&1; then
incus delete --force "$CT"
echo "Destroyed $CT"
else
echo "$CT does not exist, skip"
fi
args:
executable: /bin/bash
loop:
- backend
- stream
- web
register: cleanup_result
changed_when: "'Destroyed' in (cleanup_result.stdout | default(''))"
tags: [cleanup]
- name: Report what was destroyed
ansible.builtin.debug:
msg: |
Cleanup of color {{ target_color }} in env {{ veza_env }} complete.
Active color unchanged: {{ current_active_color }}.
Next deploy will recreate {{ target_color }} containers from scratch.

@@ -1,355 +0,0 @@
# deploy_app.yml — second-half of every deploy. Runs AFTER
# deploy_data.yml has snapshot + ensured data services up.
#
# Phases (mirror docs/RUNBOOK_ROLLBACK.md):
# A — Run migrations in an ephemeral tools container.
# B — Read /var/lib/veza/active-color in the HAProxy container,
# compute inactive_color (the color we are deploying TO).
# C — Destroy + relaunch the three app containers in inactive_color.
# Apply roles/veza_app per component (artefact install + health
# probe).
# D — Implicit in C: veza_app role's probe.yml runs. If any color's
# probe fails, the playbook errors and Phase E is skipped (HAProxy
# still pointing at the prior active color).
# E — Switch HAProxy via roles/veza_haproxy_switch (block/rescue
# guards prior cfg).
# F — External verification: curl through HAProxy, fail the playbook
# (and reverse-switch) if the public health endpoint doesn't return 200.
#
# Required extra-vars:
# veza_env staging | prod
# veza_release_sha 40-char git SHA
---
# =====================================================================
# Phase A — Migrations
# =====================================================================
- name: Phase A — apply database migrations
hosts: incus_hosts
become: true
gather_facts: true
tasks:
- name: Validate inputs
ansible.builtin.assert:
that:
- veza_env in ['staging', 'prod']
- veza_release_sha | length == 40
fail_msg: deploy_app.yml requires veza_env + veza_release_sha extra-vars.
quiet: true
- name: Ensure ephemeral tools container exists
ansible.builtin.shell: |
set -e
TOOLS="{{ veza_container_prefix }}backend-tools"
if ! incus info "$TOOLS" >/dev/null 2>&1; then
incus launch {{ veza_app_base_image }} "$TOOLS" \
--profile veza-app --profile veza-net \
--network "{{ veza_incus_network }}"
for i in $(seq 1 30); do
incus exec "$TOOLS" -- /bin/true 2>/dev/null && exit 0
sleep 1
done
echo "tools container did not become ready"
exit 1
fi
args:
executable: /bin/bash
register: tools_provision
changed_when: "'incus launch' in (tools_provision.stdout | default(''))"
tags: [phaseA, migrations]
- name: Refresh inventory so the tools container becomes reachable
ansible.builtin.meta: refresh_inventory
tags: [phaseA]
- name: Phase A — install backend artifact + run migrate_tool inside tools
hosts: "{{ veza_container_prefix + 'backend-tools' }}"
become: true
gather_facts: false
vars:
ansible_connection: community.general.incus
ansible_python_interpreter: /usr/bin/python3
veza_component: backend
veza_target_color: tools # not blue/green — bypass color logic in name
tasks:
- name: Apt deps for tools container
ansible.builtin.apt:
name:
- ca-certificates
- curl
- postgresql-client
- libssl3
- zstd
state: present
update_cache: true
cache_valid_time: 3600
- name: Ensure migrate user
ansible.builtin.user:
name: veza-migrate
system: true
shell: /usr/sbin/nologin
- name: Ensure /opt/veza/migrate
ansible.builtin.file:
path: /opt/veza/migrate
state: directory
owner: veza-migrate
mode: "0755"
- name: Fetch backend tarball
ansible.builtin.get_url:
url: "{{ veza_artifact_base_url }}/veza-backend/{{ veza_release_sha }}/veza-backend-{{ veza_release_sha }}.tar.zst"
dest: "/tmp/veza-backend-{{ veza_release_sha }}.tar.zst"
mode: "0600"
headers:
Authorization: "token {{ vault_forgejo_registry_token | default('') }}"
force: false
- name: Extract tarball into /opt/veza/migrate
ansible.builtin.unarchive:
src: "/tmp/veza-backend-{{ veza_release_sha }}.tar.zst"
dest: "/opt/veza/migrate"
remote_src: true
owner: veza-migrate
creates: "/opt/veza/migrate/migrate_tool"
- name: Run migrate_tool
ansible.builtin.command: /opt/veza/migrate/migrate_tool --up
environment:
DATABASE_URL: "postgres://veza:{{ vault_postgres_password }}@{{ veza_container_prefix }}postgres.{{ veza_incus_dns_suffix }}:5432/veza?sslmode=disable"
register: migrate_result
changed_when: "'no changes' not in (migrate_result.stdout | default('').lower())"
no_log: true # DATABASE_URL contains the password
tags: [phaseA, migrations]
# =====================================================================
# Phase B — Determine inactive color
# =====================================================================
- name: Phase B — read active color, compute inactive_color
hosts: "{{ veza_container_prefix + 'haproxy' }}"
become: true
gather_facts: false
vars:
ansible_connection: community.general.incus
ansible_python_interpreter: /usr/bin/python3
tasks:
- name: Read currently-active color
ansible.builtin.slurp:
src: /var/lib/veza/active-color
register: prior_color_raw
failed_when: false
- name: Resolve prior_active_color (default blue if no history)
ansible.builtin.set_fact:
prior_active_color: >-
{{ (prior_color_raw.content | b64decode | trim) if prior_color_raw.content is defined
else 'blue' }}
cacheable: true
- name: Compute inactive_color (the one we deploy TO)
ansible.builtin.set_fact:
inactive_color: "{{ 'green' if prior_active_color == 'blue' else 'blue' }}"
cacheable: true
- name: Show what we are switching to
ansible.builtin.debug:
msg: >-
Deploying SHA {{ veza_release_sha[:12] }} to color
{{ inactive_color }} (currently active: {{ prior_active_color }}).
# =====================================================================
# Phase C — destroy + relaunch the three app containers in inactive_color
# =====================================================================
- name: Phase C — recreate inactive-color app containers (host-side)
hosts: incus_hosts
become: true
gather_facts: false
vars:
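# inactive_color was set (cacheable) on the HAProxy container's inventory entry in
# Phase B; this play targets incus_hosts, so read it back through hostvars.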
inactive_color: "{{ hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
tasks:
- name: Destroy + launch each component container
ansible.builtin.shell: |
set -e
CT="{{ veza_container_prefix }}{{ item }}-{{ inactive_color }}"
# Force-delete is fine — these are stateless app containers; the
# active color is untouched.
incus delete --force "$CT" 2>/dev/null || true
incus launch {{ veza_app_base_image }} "$CT" \
--profile veza-app \
--profile veza-net \
--network "{{ veza_incus_network }}"
for i in $(seq 1 {{ veza_app_container_ready_timeout | default(30) }}); do
if incus exec "$CT" -- /bin/true 2>/dev/null; then
exit 0
fi
sleep 1
done
echo "Container $CT did not become ready"
exit 1
args:
executable: /bin/bash
loop:
- backend
- stream
- web
changed_when: true
tags: [phaseC]
- name: Refresh inventory so freshly-launched containers become reachable
ansible.builtin.meta: refresh_inventory
tags: [phaseC]
- name: Phase C — provision backend (inactive color) via veza_app role
hosts: "{{ veza_container_prefix + 'backend-' + hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
become: true
gather_facts: false
vars:
ansible_connection: community.general.incus
ansible_python_interpreter: /usr/bin/python3
veza_component: backend
veza_target_color: "{{ hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
roles:
- veza_app
tags: [phaseC, backend]
- name: Phase C — provision stream (inactive color)
hosts: "{{ veza_container_prefix + 'stream-' + hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
become: true
gather_facts: false
vars:
ansible_connection: community.general.incus
ansible_python_interpreter: /usr/bin/python3
veza_component: stream
veza_target_color: "{{ hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
roles:
- veza_app
tags: [phaseC, stream]
- name: Phase C — provision web (inactive color)
hosts: "{{ veza_container_prefix + 'web-' + hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
become: true
gather_facts: false
vars:
ansible_connection: community.general.incus
ansible_python_interpreter: /usr/bin/python3
veza_component: web
veza_target_color: "{{ hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
roles:
- veza_app
tags: [phaseC, web]
# =====================================================================
# Phase D — cross-container probes (in addition to in-container probes
# that veza_app already ran). This catches the case where the service
# is up locally but unreachable via Incus DNS.
# =====================================================================
- name: Phase D — probe each component via Incus DNS (cross-container)
hosts: "{{ veza_container_prefix + 'haproxy' }}"
become: true
gather_facts: false
vars:
ansible_connection: community.general.incus
ansible_python_interpreter: /usr/bin/python3
tasks:
- name: Curl each component's health endpoint
ansible.builtin.uri:
url: "http://{{ veza_container_prefix }}{{ item.component }}-{{ inactive_color }}.{{ veza_incus_dns_suffix }}:{{ item.port }}{{ item.path }}"
method: GET
status_code: [200]
timeout: 5
register: cross_probe
retries: "{{ veza_healthcheck_retries }}"
delay: "{{ veza_healthcheck_delay_seconds }}"
until: cross_probe.status == 200
changed_when: false
loop:
- { component: backend, port: "{{ veza_backend_port }}", path: "{{ veza_healthcheck_paths.backend }}" }
- { component: stream, port: "{{ veza_stream_port }}", path: "{{ veza_healthcheck_paths.stream }}" }
- { component: web, port: "{{ veza_web_port }}", path: "{{ veza_healthcheck_paths.web }}" }
tags: [phaseD, probe]
# =====================================================================
# Phase E — switch HAProxy. roles/veza_haproxy_switch wraps render +
# validate + atomic-swap + HUP in a block/rescue that restores prior
# cfg on failure.
# =====================================================================
- name: Phase E — switch HAProxy to the new color
hosts: "{{ veza_container_prefix + 'haproxy' }}"
become: true
gather_facts: true # roles/veza_haproxy_switch wants ansible_date_time
vars:
ansible_connection: community.general.incus
ansible_python_interpreter: /usr/bin/python3
veza_active_color: "{{ inactive_color }}" # the color we ARE switching TO
roles:
- veza_haproxy_switch
tags: [phaseE, switch]
# =====================================================================
# Phase F — Post-deploy verification (external curl through HAProxy).
# If this fails, we revert HAProxy to the prior color via a second run
# of veza_haproxy_switch and fail the playbook.
# =====================================================================
- name: Phase F — verify externally + record deploy state
hosts: incus_hosts
become: true
gather_facts: true
vars:
inactive_color: "{{ hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
prior_active_color: "{{ hostvars[veza_container_prefix + 'haproxy']['prior_active_color'] }}"
tasks:
- name: Verify public health and record deploy state (revert HAProxy on failure)
  block:
- name: Curl public health endpoint via HAProxy
ansible.builtin.uri:
url: "{{ veza_public_url }}/api/v1/health"
method: GET
status_code: [200]
timeout: 10
validate_certs: "{{ veza_public_url.startswith('https://') }}"
register: public_health
retries: 10
delay: 3
until: public_health.status == 200
tags: [phaseF, verify]
- name: Write deploy-state.json (consumed by node-exporter textfile)
ansible.builtin.copy:
dest: /var/lib/node_exporter/textfile_collector/veza_deploy.prom
content: |
# HELP veza_deploy_active_color 0=blue, 1=green.
# TYPE veza_deploy_active_color gauge
veza_deploy_active_color{env="{{ veza_env }}"} {{ 0 if inactive_color == 'blue' else 1 }}
# HELP veza_deploy_release_sha info metric, label=sha.
# TYPE veza_deploy_release_sha gauge
veza_deploy_release_sha{env="{{ veza_env }}",sha="{{ veza_release_sha }}",color="{{ inactive_color }}"} 1
# HELP veza_deploy_last_success_timestamp unix epoch of last successful deploy.
# TYPE veza_deploy_last_success_timestamp gauge
veza_deploy_last_success_timestamp{env="{{ veza_env }}"} {{ ansible_date_time.epoch }}
mode: "0644"
tags: [phaseF, metrics]
rescue:
- name: Public health failed — record the failure timestamp
ansible.builtin.copy:
dest: /var/lib/node_exporter/textfile_collector/veza_deploy.prom
content: |
# HELP veza_deploy_last_failure_timestamp unix epoch of last failed deploy.
# TYPE veza_deploy_last_failure_timestamp gauge
veza_deploy_last_failure_timestamp{env="{{ veza_env }}",sha="{{ veza_release_sha }}",color="{{ inactive_color }}"} {{ ansible_date_time.epoch }}
mode: "0644"
failed_when: false
- name: Re-switch HAProxy back to the prior color
ansible.builtin.import_role:
name: veza_haproxy_switch
vars:
veza_active_color: "{{ prior_active_color }}"
delegate_to: "{{ veza_container_prefix + 'haproxy' }}"
- name: Fail the playbook
ansible.builtin.fail:
msg: >-
Public health probe via HAProxy failed after deploy of SHA
{{ veza_release_sha[:12] }} to color {{ inactive_color }}.
HAProxy reverted to the prior color ({{ prior_active_color }}).
The freshly-deployed {{ inactive_color }} containers are kept
alive for forensics — inspect with:
incus exec {{ veza_container_prefix }}backend-{{ inactive_color }} -- journalctl -u veza-backend -n 200

@@ -1,411 +0,0 @@
# deploy_data.yml — idempotent data-tier provisioning. Runs FIRST in
# every deploy. Three principles:
#
# 1. ZFS-snapshot every data container's dataset before doing
# anything else. The snapshot is the safety net for any later
# mistake in the same run.
# 2. Containers are created if absent, never destroyed. Volumes
# survive every deploy.
# 3. Service config drift is reconciled, but state-bearing things
# (data dirs, schema, MinIO buckets) are reload-not-restart
# where the daemon supports it.
#
# Required extra-vars:
# veza_env one of staging|prod (selects inventory group_vars)
# veza_release_sha git SHA of the release (snapshot label)
#
# Caller pattern in .forgejo/workflows/deploy.yml:
# ansible-playbook -i inventory/{{env}}.yml playbooks/deploy_data.yml \
# -e veza_env={{env}} -e veza_release_sha={{sha}}
---
- name: Pre-flight — validate inputs and resolve runtime context
hosts: incus_hosts
become: true
gather_facts: true
tasks:
- name: Assert required vars are set
ansible.builtin.assert:
that:
- veza_env is defined
- veza_env in ['staging', 'prod']
- veza_release_sha is defined
- veza_release_sha | length == 40
fail_msg: >-
deploy_data.yml requires veza_env (staging|prod) +
veza_release_sha (40-char SHA). Pass via -e on the
command line or via inventory group_vars.
- name: Compute the list of data containers we manage
ansible.builtin.set_fact:
veza_data_containers:
- name: "{{ veza_container_prefix }}postgres"
kind: postgres
- name: "{{ veza_container_prefix }}redis"
kind: redis
- name: "{{ veza_container_prefix }}rabbitmq"
kind: rabbitmq
- name: "{{ veza_container_prefix }}minio"
kind: minio
# -----------------------------------------------------------------------
# ZFS snapshot before mutation. A failed prune is logged but not fatal —
# safer to lose disk to retained snapshots than to skip the snapshot.
# -----------------------------------------------------------------------
- name: ZFS-snapshot every data container's dataset
hosts: incus_hosts
become: true
gather_facts: false
tasks:
- name: Snapshot per-container dataset
ansible.builtin.shell: |
set -e
# Best-effort dataset path resolution from `incus storage volume show`.
# If the container doesn't exist yet (first-ever deploy), skip — there's
# nothing to snapshot.
if ! incus info "{{ item.name }}" >/dev/null 2>&1; then
echo "Container {{ item.name }} does not yet exist, skip snapshot"
exit 0
fi
DATASET=$(zfs list -H -o name | grep -E "containers/{{ item.name }}$" | head -1 || true)
if [ -z "$DATASET" ]; then
echo "No ZFS dataset for {{ item.name }} — likely non-ZFS storage, skip"
exit 0
fi
SNAP_NAME="${DATASET}@pre-deploy-{{ veza_release_sha }}"
if zfs list -H -t snapshot "$SNAP_NAME" >/dev/null 2>&1; then
echo "Snapshot $SNAP_NAME already exists (idempotent rerun)"
exit 0
fi
zfs snapshot "$SNAP_NAME"
echo "Created $SNAP_NAME"
args:
executable: /bin/bash
loop: "{{ veza_data_containers }}"
register: snap_result
changed_when: "'Created' in (snap_result.stdout | default(''))"
tags: [data, zfs, snapshot]
- name: Prune ZFS snapshots beyond retention window
ansible.builtin.shell: |
set -e
# Keep the {{ veza_release_retention | default(30) }} most-recent
# pre-deploy snapshots per dataset ; delete the rest.
for dataset in $(zfs list -H -o name | grep -E "containers/{{ veza_container_prefix }}(postgres|redis|rabbitmq|minio)$"); do
zfs list -H -t snapshot -o name -s creation "$dataset" \
| grep "@pre-deploy-" \
| head -n -{{ veza_release_retention | default(30) }} \
| xargs -r -n1 zfs destroy -r || true
done
args:
executable: /bin/bash
changed_when: false
failed_when: false
tags: [data, zfs, prune]
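# -----------------------------------------------------------------------
# Consuming the safety net by hand — a sketch only ; pool, dataset path and
# container prefix are environment-specific. Stop the container first :
#   incus stop <prefix>postgres
#   zfs rollback -r <pool>/containers/<prefix>postgres@pre-deploy-<sha>
#   incus start <prefix>postgres
# -----------------------------------------------------------------------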
# -----------------------------------------------------------------------
# Provision (create-if-absent) each data container. We don't recreate
# existing ones — they own state.
# -----------------------------------------------------------------------
- name: Ensure data containers exist
hosts: incus_hosts
become: true
gather_facts: false
tasks:
- name: Launch container if absent
ansible.builtin.shell: |
set -e
if incus info "{{ item.name }}" >/dev/null 2>&1; then
echo "{{ item.name }} already exists"
exit 0
fi
incus launch {{ veza_app_base_image }} "{{ item.name }}" \
--profile veza-data \
--profile veza-net \
--network "{{ veza_incus_network }}"
# Wait until the container accepts `incus exec` before any subsequent
# task (apt, systemd) hits a half-up container.
for i in $(seq 1 {{ veza_app_container_ready_timeout | default(30) }}); do
if incus exec "{{ item.name }}" -- /bin/true 2>/dev/null; then
echo "Container {{ item.name }} ready"
exit 0
fi
sleep 1
done
echo "Container {{ item.name }} did not become ready within timeout"
exit 1
args:
executable: /bin/bash
loop: "{{ veza_data_containers }}"
register: launch_result
changed_when: "'Container' in (launch_result.stdout | default('')) and 'ready' in (launch_result.stdout | default(''))"
tags: [data, provision]
- name: Refresh inventory so the new containers become reachable
ansible.builtin.meta: refresh_inventory
tags: [data, provision]
# -----------------------------------------------------------------------
# Per-kind service config. Implemented inline rather than via roles so
# this playbook stays readable. When a kind grows, lift it into its own
# tasks/<kind>.yml or role.
# -----------------------------------------------------------------------
- name: Configure postgres
hosts: "{{ veza_container_prefix + 'postgres' }}"
become: true
gather_facts: false
vars:
ansible_connection: community.general.incus
ansible_python_interpreter: /usr/bin/python3
tasks:
- name: Install postgresql-16
ansible.builtin.apt:
name:
- postgresql-16
- python3-psycopg2 # Required by Ansible's postgresql_user/db modules
state: present
update_cache: true
cache_valid_time: 3600
- name: Ensure postgres is enabled + started
ansible.builtin.systemd:
name: postgresql
state: started
enabled: true
- name: Wait for postgres ready
ansible.builtin.wait_for:
port: 5432
host: 127.0.0.1
timeout: 30
- name: Ensure veza role exists with the vault-stored password
community.postgresql.postgresql_user:
name: veza
password: "{{ vault_postgres_password }}"
role_attr_flags: LOGIN
become_user: postgres
no_log: true
- name: Ensure veza database exists owned by veza role
community.postgresql.postgresql_db:
name: veza
owner: veza
encoding: UTF8
lc_collate: C
lc_ctype: C
template: template0
become_user: postgres
tags: [data, postgres]
- name: Configure redis
hosts: "{{ veza_container_prefix + 'redis' }}"
become: true
gather_facts: false
vars:
ansible_connection: community.general.incus
ansible_python_interpreter: /usr/bin/python3
tasks:
- name: Install redis-server
ansible.builtin.apt:
name: redis-server
state: present
update_cache: true
cache_valid_time: 3600
- name: Render redis.conf with password
ansible.builtin.copy:
content: |
bind 0.0.0.0
protected-mode yes
port 6379
requirepass {{ vault_redis_password }}
maxmemory 256mb
maxmemory-policy allkeys-lru
appendonly yes
appendfsync everysec
dir /var/lib/redis
dest: /etc/redis/redis.conf
owner: redis
group: redis
mode: "0640"
no_log: true
notify: Restart redis
- name: Ensure redis is enabled + started
ansible.builtin.systemd:
name: redis-server
state: started
enabled: true
- name: Wait for redis ready
ansible.builtin.wait_for:
port: 6379
host: 127.0.0.1
timeout: 30
handlers:
- name: Restart redis
ansible.builtin.systemd:
name: redis-server
state: restarted
tags: [data, redis]
- name: Configure rabbitmq
hosts: "{{ veza_container_prefix + 'rabbitmq' }}"
become: true
gather_facts: false
vars:
ansible_connection: community.general.incus
ansible_python_interpreter: /usr/bin/python3
tasks:
- name: Install rabbitmq-server
ansible.builtin.apt:
name: rabbitmq-server
state: present
update_cache: true
cache_valid_time: 3600
- name: Ensure rabbitmq is enabled + started
ansible.builtin.systemd:
name: rabbitmq-server
state: started
enabled: true
- name: Wait for rabbitmq ready
ansible.builtin.wait_for:
port: 5672
host: 127.0.0.1
timeout: 60
- name: Ensure /veza vhost exists
community.rabbitmq.rabbitmq_vhost:
name: /veza
state: present
- name: Ensure veza user exists with vault password
community.rabbitmq.rabbitmq_user:
user: veza
password: "{{ vault_rabbitmq_password }}"
vhost: /veza
configure_priv: ".*"
read_priv: ".*"
write_priv: ".*"
state: present
update_password: always
no_log: true
tags: [data, rabbitmq]
- name: Configure minio
hosts: "{{ veza_container_prefix + 'minio' }}"
become: true
gather_facts: false
vars:
ansible_connection: community.general.incus
ansible_python_interpreter: /usr/bin/python3
tasks:
- name: Install MinIO server + mc client binaries (direct download if absent)
ansible.builtin.shell: |
set -e
if ! command -v minio >/dev/null 2>&1; then
curl -fsSL https://dl.min.io/server/minio/release/linux-amd64/minio -o /usr/local/bin/minio
chmod 0755 /usr/local/bin/minio
fi
if ! command -v mc >/dev/null 2>&1; then
curl -fsSL https://dl.min.io/client/mc/release/linux-amd64/mc -o /usr/local/bin/mc
chmod 0755 /usr/local/bin/mc
fi
args:
executable: /bin/bash
changed_when: false
- name: Ensure minio system user
ansible.builtin.user:
name: minio
system: true
shell: /usr/sbin/nologin
home: /var/lib/minio
- name: Ensure minio data dir
ansible.builtin.file:
path: /var/lib/minio
state: directory
owner: minio
group: minio
mode: "0750"
- name: Render minio EnvironmentFile
ansible.builtin.copy:
content: |
MINIO_ROOT_USER={{ vault_minio_root_user }}
MINIO_ROOT_PASSWORD={{ vault_minio_root_password }}
MINIO_VOLUMES=/var/lib/minio
MINIO_OPTS="--address :9000 --console-address :9001"
dest: /etc/default/minio
owner: root
group: root
mode: "0640"
no_log: true
notify: Restart minio
- name: Render minio systemd unit
ansible.builtin.copy:
content: |
[Unit]
Description=MinIO
After=network-online.target
Wants=network-online.target
[Service]
Type=simple
User=minio
Group=minio
EnvironmentFile=/etc/default/minio
ExecStart=/usr/local/bin/minio server $MINIO_OPTS $MINIO_VOLUMES
Restart=on-failure
LimitNOFILE=65535
[Install]
WantedBy=multi-user.target
dest: /etc/systemd/system/minio.service
mode: "0644"
notify:
- Reload systemd
- Restart minio
- name: Enable + start minio
ansible.builtin.systemd:
name: minio
state: started
enabled: true
daemon_reload: true
- name: Wait for minio ready
ansible.builtin.wait_for:
port: 9000
host: 127.0.0.1
timeout: 60
- name: Configure mc client alias
ansible.builtin.shell: |
set -e
mc alias set veza-local http://127.0.0.1:9000 \
"{{ vault_minio_root_user }}" "{{ vault_minio_root_password }}" >/dev/null
args:
executable: /bin/bash
changed_when: false
no_log: true
- name: Ensure veza-{{ veza_env }} bucket exists
ansible.builtin.shell: |
mc mb --ignore-existing veza-local/veza-{{ veza_env }}
args:
executable: /bin/bash
changed_when: false
handlers:
- name: Reload systemd
ansible.builtin.systemd:
daemon_reload: true
- name: Restart minio
ansible.builtin.systemd:
name: minio
state: restarted
tags: [data, minio]
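# Spot-check by hand — a sketch only ; prefix and env are environment-specific :
#   incus exec <prefix>minio -- mc ls veza-local/veza-<env>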

View file

@ -1,113 +0,0 @@
# rollback.yml — two modes :
#
# 1. fast : flip HAProxy back to the previous active color.
# Works only if those containers are still alive
# (i.e., the next deploy has NOT yet recycled them).
# Effect time : ~5 seconds.
#
# 2. full : redeploy a specific release_sha by re-running
# deploy_app.yml with that SHA. Works whenever the
# tarball is still in the Forgejo Registry. Effect
# time : ~5-10 minutes.
#
# Required extra-vars:
# env staging | prod
# mode fast | full
# target_color (mode=fast only) the color to flip TO
# release_sha (mode=full only) the SHA to redeploy
#
# Caller (workflow_dispatch only — see .forgejo/workflows/rollback.yml):
# ansible-playbook -i inventory/{{env}}.yml playbooks/rollback.yml \
# -e env={{env}} -e mode=fast -e target_color=blue
# ansible-playbook -i inventory/{{env}}.yml playbooks/rollback.yml \
# -e env={{env}} -e mode=full -e release_sha=<previous_sha>
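#
# mode=fast re-reads the previous SHA from the active-color history file
# maintained by the veza_haproxy_switch role — one entry per deploy, e.g.
# (illustrative timestamp) :
#   2025-06-01T12:34:56Z sha=<previous 40-char sha> color=green prior=blue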
---
- name: Validate inputs
hosts: incus_hosts
become: true
gather_facts: false
tasks:
- name: Assert env + mode
ansible.builtin.assert:
that:
- veza_env is defined
- veza_env in ['staging', 'prod']
- mode is defined
- mode in ['fast', 'full']
fail_msg: rollback.yml requires veza_env + mode (fast|full).
quiet: true
- name: Assert target_color when mode=fast
ansible.builtin.assert:
that:
- target_color is defined
- target_color in ['blue', 'green']
fail_msg: rollback.yml mode=fast requires target_color (blue|green).
quiet: true
when: mode == 'fast'
- name: Assert release_sha when mode=full
ansible.builtin.assert:
that:
- veza_release_sha is defined
- veza_release_sha | length == 40
fail_msg: rollback.yml mode=full requires release_sha (40-char SHA).
quiet: true
when: mode == 'full'
# ---------------------------------------------------------------------
# mode=fast → HAProxy flip only.
# ---------------------------------------------------------------------
- name: Fast rollback — verify target_color containers are alive
hosts: incus_hosts
become: true
gather_facts: false
tasks:
- name: Check each target-color container exists
ansible.builtin.shell: |
set -e
CT="{{ veza_container_prefix }}{{ item }}-{{ target_color }}"
if ! incus info "$CT" >/dev/null 2>&1; then
echo "MISSING $CT"
exit 1
fi
STATE=$(incus list "$CT" -c s --format csv)
if [ "$STATE" != "RUNNING" ]; then
echo "$CT is $STATE (not RUNNING)"
exit 1
fi
echo "OK $CT"
args:
executable: /bin/bash
loop:
- backend
- stream
- web
changed_when: false
register: alive_check
when: mode == 'fast'
tags: [rollback, fast]
- name: Fast rollback — flip HAProxy
hosts: "{{ veza_container_prefix + 'haproxy' }}"
become: true
gather_facts: true
vars:
ansible_connection: community.general.incus
ansible_python_interpreter: /usr/bin/python3
veza_active_color: "{{ target_color }}"
# Fast rollback re-uses the previous SHA from the history file.
veza_release_sha: "{{ lookup('ansible.builtin.file', '/var/lib/veza/active-color.history', errors='ignore') | regex_search('sha=([0-9a-f]+)', '\\1') | default(['rollback'], true) | first }}"
roles:
- veza_haproxy_switch
when: mode == 'fast'
tags: [rollback, fast]
# ---------------------------------------------------------------------
# mode=full → re-import deploy_app.yml with the rollback SHA.
# Functionally identical to a fresh deploy of an older release.
# ---------------------------------------------------------------------
- name: Full rollback — delegate to deploy_app.yml with release_sha={{ veza_release_sha | default('') }}
ansible.builtin.import_playbook: deploy_app.yml
when: mode == 'full'
tags: [rollback, full]

View file

@ -1,16 +1,5 @@
# Managed by Ansible — do not edit by hand.
# v1.0.9 W4 Day 19 (multi-instance) → W5+ extended to blue/green.
# `haproxy_topology` (set in group_vars/<env>.yml) selects between:
#
# multi-instance (default, lab) — server list comes from inventory
# groups backend_api_instances, stream_server_instances ; sticky
# cookie load-balances across N peers.
# blue-green (staging, prod) — server list is exactly two:
# <prefix>backend-blue + <prefix>backend-green. veza_active_color
# picks which one is primary ; the other is `backup` (HAProxy
# routes to a backup server only when ALL primaries are down).
# The veza_haproxy_switch role re-renders this template with a
# new active_color, validates, atomic-swaps, and HUPs.
# v1.0.9 W4 Day 19.
global
log /dev/log local0
@ -21,7 +10,11 @@ global
user haproxy
group haproxy
daemon
# Avoid leaking the version banner in error pages.
server-state-file /var/lib/haproxy/server-state
# ssl-default-bind-* tightens TLS to modern ciphers ; lifted directly
# from the Mozilla Intermediate profile. Only effective when a TLS
# cert is mounted (see haproxy_tls_cert_path).
ssl-default-bind-options no-sslv3 no-tlsv10 no-tlsv11
ssl-default-bind-ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305
@ -30,20 +23,22 @@ defaults
mode http
option httplog
option dontlognull
option forwardfor
option forwardfor # adds X-Forwarded-For so backend logs see the real IP
option http-server-close
timeout connect 5s
timeout client 60s
timeout server 60s
timeout tunnel 1h
timeout tunnel 1h # WS connections are long-lived ; bumped from default 1m
timeout client-fin 5s
timeout http-keep-alive 15s
timeout http-request 10s
# Restore previous server state on reload so health checks don't
# restart from scratch + the drain timer survives.
load-server-state-from-file global
# -----------------------------------------------------------------------
# Stats endpoint — bound to loopback only ; the Prometheus haproxy
# exporter sidecar scrapes it.
# Stats endpoint — bound to loopback only so the prometheus haproxy
# exporter (sidecar) can scrape it. Auth lives at the bridge layer.
# -----------------------------------------------------------------------
frontend stats
bind 127.0.0.1:{{ haproxy_listen_stats }}
@ -55,7 +50,8 @@ frontend stats
no log
# -----------------------------------------------------------------------
# Frontend — HTTP + (optionally) HTTPS. ACL-driven path routing.
# Frontend HTTP. v1.0 lab uses HTTP only ; uncomment the HTTPS bind
# when haproxy_tls_cert_path is non-empty (Mozilla intermediate).
# -----------------------------------------------------------------------
frontend veza_http_in
bind *:{{ haproxy_listen_http }}
@ -65,102 +61,23 @@ frontend veza_http_in
http-request redirect scheme https code 301 if !{ ssl_fc }
{% endif %}
acl is_api path_beg /api/v1
{% if haproxy_topology | default('multi-instance') == 'blue-green' %}
acl is_stream_seg path_beg /tracks/ path_end .m3u8
acl is_stream_seg path_beg /tracks/ path_end .ts
acl is_stream_seg path_beg /tracks/ path_end .m4s
acl is_stream_path path_beg /stream
acl is_stream_path path_beg /hls
use_backend backend_api if is_api
use_backend stream_pool if is_stream_seg
use_backend stream_pool if is_stream_path
default_backend web_pool
{% else %}
# Path-based routing :
# /api/v1/ws/* → backend api_pool (sticky cookie ; carries chat WS)
# /api/v1/* → backend api_pool (also sticky so 401 → /me roundtrips work)
# /tracks/*/hls → backend stream_pool (URI-hash for cache locality)
# else → backend api_pool (default)
acl is_stream path_beg /tracks/ path_end .m3u8
acl is_stream path_beg /tracks/ path_end .ts
acl is_stream path_beg /tracks/ path_end .m4s
use_backend stream_pool if is_stream
default_backend api_pool
{% endif %}
{% if haproxy_topology | default('multi-instance') == 'blue-green' %}
# =======================================================================
# BLUE / GREEN topology (staging, prod)
#
# active_color is the variable veza_haproxy_switch passes in. It selects
# which server gets `check` and which gets `check backup`. HAProxy only
# routes to a `backup` server when EVERY non-backup is marked down by
# its health check ; together with health-check fall=3 this gives us
# instant rollback to the prior color if the new one starts failing
# health checks (without re-running Ansible).
#
# Active color: {{ veza_active_color | default(haproxy_active_color | default('blue')) }}
# Container prefix: {{ veza_container_prefix }}
# DNS suffix: {{ veza_incus_dns_suffix }}
# =======================================================================
{% set _active = veza_active_color | default(haproxy_active_color | default('blue')) %}
# -----------------------------------------------------------------------
# Backend API pool — Go. Sticky cookie ; backup color sits idle.
# -----------------------------------------------------------------------
backend backend_api
balance roundrobin
option httpchk GET {{ veza_healthcheck_paths.backend | default('/api/v1/health') }}
http-check expect status 200
cookie {{ haproxy_sticky_cookie_name }} insert indirect nocache httponly secure
default-server check
inter {{ haproxy_health_check_interval_ms }}
fall {{ haproxy_health_check_fall }}
rise {{ haproxy_health_check_rise }}
on-marked-down shutdown-sessions
slowstart {{ haproxy_graceful_drain_seconds }}s
server backend_blue {{ veza_container_prefix }}backend-blue.{{ veza_incus_dns_suffix }}:{{ veza_backend_port }} cookie backend_blue {{ '' if _active == 'blue' else 'backup' }}
server backend_green {{ veza_container_prefix }}backend-green.{{ veza_incus_dns_suffix }}:{{ veza_backend_port }} cookie backend_green {{ '' if _active == 'green' else 'backup' }}
# -----------------------------------------------------------------------
# Stream pool — Rust Axum HLS. URI-hash for cache locality. Same
# blue/green pair, same backup-flag pattern.
# -----------------------------------------------------------------------
backend stream_pool
balance uri whole
hash-type consistent
option httpchk GET {{ veza_healthcheck_paths.stream | default('/health') }}
http-check expect status 200
timeout tunnel 1h
default-server check
inter {{ haproxy_health_check_interval_ms }}
fall {{ haproxy_health_check_fall }}
rise {{ haproxy_health_check_rise }}
on-marked-down shutdown-sessions
slowstart {{ haproxy_graceful_drain_seconds }}s
server stream_blue {{ veza_container_prefix }}stream-blue.{{ veza_incus_dns_suffix }}:{{ veza_stream_port }} {{ '' if _active == 'blue' else 'backup' }}
server stream_green {{ veza_container_prefix }}stream-green.{{ veza_incus_dns_suffix }}:{{ veza_stream_port }} {{ '' if _active == 'green' else 'backup' }}
# -----------------------------------------------------------------------
# Web pool — React SPA served by nginx. Same pair, same pattern.
# -----------------------------------------------------------------------
backend web_pool
balance roundrobin
option httpchk GET {{ veza_healthcheck_paths.web | default('/') }}
http-check expect status 200
default-server check
inter {{ haproxy_health_check_interval_ms }}
fall {{ haproxy_health_check_fall }}
rise {{ haproxy_health_check_rise }}
on-marked-down shutdown-sessions
slowstart {{ haproxy_graceful_drain_seconds }}s
server web_blue {{ veza_container_prefix }}web-blue.{{ veza_incus_dns_suffix }}:{{ veza_web_port }} {{ '' if _active == 'blue' else 'backup' }}
server web_green {{ veza_container_prefix }}web-green.{{ veza_incus_dns_suffix }}:{{ veza_web_port }} {{ '' if _active == 'green' else 'backup' }}
{% else %}
# =======================================================================
# MULTI-INSTANCE topology (lab, default)
# Server list comes from inventory groups ; sticky cookie load-balances.
# =======================================================================
# -----------------------------------------------------------------------
# Backend api_pool — Gin REST API. Sticky cookie + active health check.
# `cookie ... insert indirect nocache` : HAProxy sets the cookie on the
# first response, the browser sends it back, subsequent requests stick
# to the same server. WS upgrades inherit it.
# -----------------------------------------------------------------------
backend api_pool
balance roundrobin
@ -181,7 +98,9 @@ backend api_pool
# -----------------------------------------------------------------------
# Backend stream_pool — Rust Axum HLS. URI hash so the same track_id
# consistently lands on the same node.
# consistently lands on the same node, keeping the in-process HLS
# segment cache warm. `consistent` flag = jump-hash so adding/removing
# a node doesn't flush the entire pool.
# -----------------------------------------------------------------------
backend stream_pool
balance uri whole
@ -199,5 +118,3 @@ backend stream_pool
{% for host in stream_hosts %}
server {{ host }} {{ host }}.lxd:{{ haproxy_stream_server_port }}
{% endfor %}
{% endif %}

View file

@ -35,9 +35,7 @@ veza_app_binary_mode: "0755"
veza_app_container_name: "{{ veza_container_prefix }}{{ veza_component }}-{{ veza_target_color }}"
# URL to fetch the release tarball. Computed once per task chain.
# `veza-<component>` is the Forgejo package name (one package per
# component) ; SHA is the version ; tarball is the filename.
veza_app_artifact_url: "{{ veza_artifact_base_url }}/veza-{{ veza_component }}/{{ veza_release_sha }}/veza-{{ veza_component }}-{{ veza_release_sha }}.tar.zst"
veza_app_artifact_url: "{{ veza_artifact_base_url }}/{{ veza_component }}/{{ veza_release_sha }}/veza-{{ veza_component }}-{{ veza_release_sha }}.tar.zst"
# How long to wait for the container's network namespace to come up
# after `incus launch` before we start running tasks against it.

View file

@ -1,47 +0,0 @@
# `veza_haproxy_switch` role
Atomically swap HAProxy's active color. Runs against the
`{{ veza_container_prefix }}haproxy` container after `veza_app` has
recreated + health-probed all three components in the inactive color.
## Why a separate role from `haproxy`?
- `roles/haproxy` provisions a fresh HAProxy container — install
the package, lay down the *initial* config, enable the systemd
unit. It runs once when the staging/prod env is bootstrapped and
occasionally when the global config shape changes.
- `roles/veza_haproxy_switch` performs the *per-deploy* delta —
re-template the cfg with a new `veza_active_color`, validate,
swap, HUP. It runs once at the end of every successful deploy.
Splitting them keeps the per-deploy path narrow (no apt, no service
install) and lets `roles/haproxy` remain idempotent when the global
shape hasn't changed.
## Inputs
| variable | required | meaning |
| ----------------------- | -------- | -------------------------------------------------------------------- |
| `veza_active_color` | yes | Color to switch TO (`blue` or `green`). Becomes the new active. |
| `veza_release_sha` | yes | SHA being deployed. Logged in the active-color history file. |
| `veza_container_prefix` | inherit | From group_vars/<env>.yml. |
| `haproxy_topology` | inherit | Should be `blue-green` for this role to make sense. |
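## Example invocation
A minimal sketch of the per-deploy call site — variable values are
illustrative ; the deploy playbook is the real caller :
```yaml
- name: Switch HAProxy to the freshly health-probed color
  ansible.builtin.import_role:
    name: veza_haproxy_switch
  vars:
    veza_active_color: "{{ inactive_color }}"
    veza_release_sha: "{{ veza_release_sha }}"
  delegate_to: "{{ veza_container_prefix + 'haproxy' }}"
```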
## Failure semantics
The render → validate → atomic-swap → HUP sequence runs in an
Ansible `block:` with a `rescue:` that restores `haproxy.cfg.bak`
(captured before the swap) and re-HUPs. So an invalid config or a
HUP failure leaves HAProxy serving the *previous* active color
exactly as before — the deploy as a whole then fails on the playbook
level.
## What the role does NOT do
- It does not destroy or recreate the HAProxy container. That's a
one-time operation under `roles/haproxy`.
- It does not touch app containers — by the time this role runs,
blue/green app containers are both healthy.
- It does not remove the previously-active color's containers. They
survive (intentional) so a rollback can flip back instantly. The
next deploy naturally recycles them.

View file

@ -1,18 +0,0 @@
---
# These should be set by the caller — defaults here are guards that
# fail loud if the caller forgot to pass them.
veza_active_color: ""
veza_release_sha: ""
# Paths inside the HAProxy container.
haproxy_cfg_path: /etc/haproxy/haproxy.cfg
haproxy_cfg_new_path: /etc/haproxy/haproxy.cfg.new
haproxy_cfg_backup_path: /etc/haproxy/haproxy.cfg.bak
haproxy_state_dir: /var/lib/veza
haproxy_active_color_file: /var/lib/veza/active-color
haproxy_active_color_history: /var/lib/veza/active-color.history
# How many history entries to keep before pruning. The rollback role
# offers point-in-time switch within this window without redeploying
# the artefact.
haproxy_active_color_history_keep: 5

View file

@ -1,9 +0,0 @@
---
# HUP haproxy via systemd reload (graceful — drains old workers).
# Used both on success (after atomic swap) and on rescue (after
# restoring backup).
- name: Reload haproxy
ansible.builtin.systemd:
name: haproxy
state: reloaded
listen: "veza-haproxy reload"

View file

@ -1,16 +0,0 @@
---
galaxy_info:
role_name: veza_haproxy_switch
author: Veza Ops
description: >-
Atomically swap HAProxy's active color (blue/green) and persist
the new state. Runs once per deploy, after veza_app has health-
probed all components in the inactive color. Block/rescue
guarantees HAProxy never lands on a bad config.
license: proprietary
min_ansible_version: "2.15"
platforms:
- name: Debian
versions: ["13"]
dependencies: []

View file

@ -1,142 +0,0 @@
# Atomic blue/green switch. The HAProxy template lives in
# roles/haproxy/templates/haproxy.cfg.j2 — it reads veza_active_color
# to render the right `backup` directives. We re-template, validate,
# atomic-swap, HUP.
#
# Block/rescue: any failure in the four-step sequence restores
# haproxy.cfg from the backup we capture before touching anything.
# That way, an invalid template or a HUP error never leaves HAProxy
# serving from a stale or broken cfg — it stays on whatever was
# active when the role started.
---
- name: Validate inputs
ansible.builtin.assert:
that:
- veza_active_color in ['blue', 'green']
- veza_release_sha | length == 40
fail_msg: >-
veza_haproxy_switch role requires veza_active_color (blue|green)
and veza_release_sha (40-char git SHA). Got: color={{ veza_active_color }}
sha={{ veza_release_sha }}.
quiet: true
tags: [veza_haproxy_switch, always]
- name: Ensure veza state dir exists in HAProxy container
ansible.builtin.file:
path: "{{ haproxy_state_dir }}"
state: directory
owner: root
group: root
mode: "0755"
tags: [veza_haproxy_switch]
- name: Read currently-active color (if any)
ansible.builtin.slurp:
src: "{{ haproxy_active_color_file }}"
register: prior_color_raw
failed_when: false
changed_when: false
tags: [veza_haproxy_switch]
- name: Resolve prior_active_color (default blue if no history)
ansible.builtin.set_fact:
prior_active_color: >-
{{ (prior_color_raw.content | b64decode | trim) if prior_color_raw.content is defined
else 'blue' }}
tags: [veza_haproxy_switch]
- name: Switch sequence (block/rescue — restores cfg on any failure)
block:
- name: Backup current haproxy.cfg
ansible.builtin.copy:
src: "{{ haproxy_cfg_path }}"
dest: "{{ haproxy_cfg_backup_path }}"
remote_src: true
mode: "0640"
tags: [veza_haproxy_switch]
- name: Render fresh haproxy.cfg with new active_color
ansible.builtin.template:
src: "{{ playbook_dir }}/../roles/haproxy/templates/haproxy.cfg.j2"
dest: "{{ haproxy_cfg_new_path }}"
owner: root
group: haproxy
mode: "0640"
validate: "haproxy -f %s -c -q"
vars:
# Make absolutely sure the template sees the new color we are
# switching to — set both names because the older template
# used `veza_active_color` and a future revision might use
# `haproxy_active_color`.
haproxy_active_color: "{{ veza_active_color }}"
tags: [veza_haproxy_switch]
- name: Atomic swap — mv haproxy.cfg.new → haproxy.cfg
ansible.builtin.command: mv -f "{{ haproxy_cfg_new_path }}" "{{ haproxy_cfg_path }}"
changed_when: true
tags: [veza_haproxy_switch]
- name: HUP haproxy (graceful reload, no connection drop)
ansible.builtin.systemd:
name: haproxy
state: reloaded
tags: [veza_haproxy_switch]
rescue:
- name: Restore haproxy.cfg from backup
ansible.builtin.command: mv -f "{{ haproxy_cfg_backup_path }}" "{{ haproxy_cfg_path }}"
# Always attempt the restore ; tolerate failure in case the error
# happened before the backup was ever written.
failed_when: false
changed_when: true
tags: [veza_haproxy_switch]
- name: HUP haproxy back to the prior config
ansible.builtin.systemd:
name: haproxy
state: reloaded
failed_when: false
tags: [veza_haproxy_switch]
- name: Report the failure
ansible.builtin.fail:
msg: >-
HAProxy switch to color {{ veza_active_color }} (sha
{{ veza_release_sha[:12] }}) failed — config rolled back
to the prior state. HAProxy continues serving from
{{ prior_active_color }}. Inspect the validate step's
stderr in the playbook output above.
# Success path: persist new active color + history.
- name: Write new active color
ansible.builtin.copy:
dest: "{{ haproxy_active_color_file }}"
content: "{{ veza_active_color }}\n"
owner: root
group: root
mode: "0644"
tags: [veza_haproxy_switch]
- name: Append to active-color history
ansible.builtin.lineinfile:
path: "{{ haproxy_active_color_history }}"
line: "{{ ansible_date_time.iso8601 }} sha={{ veza_release_sha }} color={{ veza_active_color }} prior={{ prior_active_color }}"
create: true
insertbefore: BOF
mode: "0644"
tags: [veza_haproxy_switch]
- name: Prune history beyond keep limit
ansible.builtin.shell: |
set -e
if [ -f "{{ haproxy_active_color_history }}" ]; then
head -n {{ haproxy_active_color_history_keep }} "{{ haproxy_active_color_history }}" > "{{ haproxy_active_color_history }}.tmp"
mv -f "{{ haproxy_active_color_history }}.tmp" "{{ haproxy_active_color_history }}"
fi
args:
executable: /bin/bash
changed_when: false
tags: [veza_haproxy_switch]
- name: Drop the now-stale backup
ansible.builtin.file:
path: "{{ haproxy_cfg_backup_path }}"
state: absent
tags: [veza_haproxy_switch]
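# Post-switch spot-check (run by hand — a sketch ; the real container name
# depends on veza_container_prefix) :
#   incus exec <prefix>haproxy -- cat /var/lib/veza/active-color
#   incus exec <prefix>haproxy -- head -n 5 /var/lib/veza/active-color.history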

View file

@ -198,22 +198,3 @@ incus-logs: ## [LOW] Show logs from Incus container (usage: make incus-logs SERV
exit 1; \
fi
@incus exec veza-$(SERVICE) -- journalctl -f
# ==============================================================================
# CANARY RELEASE (W5 Day 23)
# ==============================================================================
.PHONY: deploy-canary
deploy-canary: ## [HIGH] Canary release : drain → deploy → SLI monitor → rollback on red. ARTIFACT=/path required. See docs/CANARY_RELEASE.md.
@if [ -z "$(ARTIFACT)" ]; then \
$(ECHO_CMD) "${RED}❌ ARTIFACT=/path/to/veza-api required${NC}"; \
$(ECHO_CMD) "${YELLOW} See docs/CANARY_RELEASE.md for the full env-var surface.${NC}"; \
exit 1; \
fi
@$(ECHO_CMD) "${BLUE}🚦 Canary deploy : $(ARTIFACT)${NC}"
@ARTIFACT="$(ARTIFACT)" \
ROLLBACK_BINARY="$(ROLLBACK_BINARY)" \
SLI_WINDOW="$(SLI_WINDOW)" \
PROM_URL="$(PROM_URL)" \
bash $(CURDIR)/scripts/deploy-canary.sh
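# Example (illustrative paths/values) :
#   make deploy-canary ARTIFACT=./bin/veza-api ROLLBACK_BINARY=./bin/veza-api.prev SLI_WINDOW=1800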

View file

@ -1,112 +0,0 @@
#!/usr/bin/env bash
# check-migration-backward-compat.sh — pre-deploy gate for canary releases.
#
# Refuses to deploy when the latest migration is NOT backward-compatible
# with the running schema. Backward-compat = the OLD code can still
# read/write against the NEW schema for at least one canary window
# (otherwise canary mode is meaningless ; the old node would crash on
# the first request that touches a removed column).
#
# Heuristic : reject migrations that contain any of these patterns :
# - DROP COLUMN
# - DROP TABLE
# - ALTER COLUMN ... TYPE (type change is rarely backward-compat)
# - ADD COLUMN ... NOT NULL (without DEFAULT — old code can't INSERT)
# - DROP CONSTRAINT
# - DROP INDEX UNIQUE (existing data may already violate)
#
# This is a STATIC check ; some patterns are false-positives (e.g.
# DROP COLUMN of a column that no code reads). When a real migration
# is flagged, the operator either :
# 1. Splits the migration : ship the additive part now, drop in v+1
# after old-version backends are decommissioned.
# 2. Bypasses with FORCE_MIGRATE=1 + a justification in the commit
# message of the migration file.
#
# v1.0.9 W5 Day 23.
#
# Usage :
# bash scripts/check-migration-backward-compat.sh
#
# Required env :
# MIGRATIONS_DIR default veza-backend-api/migrations
# GIT_RANGE default origin/main..HEAD ; the range to inspect for
# newly-added migration files
# Optional env :
# FORCE_MIGRATE=1 bypass with a logged warning. Use sparingly.
#
# Exit codes :
# 0 — all new migrations are backward-compat (or FORCE_MIGRATE=1)
# 1 — at least one migration carries a forbidden pattern
# 3 — required tool missing / config error
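#
# Example of the split pattern (option 1) — hypothetical columns, shown only
# to illustrate what the heuristic accepts vs flags :
#   release N   : ALTER TABLE tracks ADD COLUMN duration_ms BIGINT;  -- additive, passes the gate
#   release N+1 : ALTER TABLE tracks DROP COLUMN duration;           -- flagged ; ships after old backends are off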
set -euo pipefail
MIGRATIONS_DIR=${MIGRATIONS_DIR:-veza-backend-api/migrations}
GIT_RANGE=${GIT_RANGE:-origin/main..HEAD}
FORCE_MIGRATE=${FORCE_MIGRATE:-0}
log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; }
fail() { log "FAIL: $*"; exit "${2:-1}"; }
require() {
command -v "$1" >/dev/null 2>&1 || fail "required tool missing: $1" 3
}
require git
require grep
require date
# Patterns that indicate non-backward-compat schema change.
# Single-quoted so the pipe characters stay literal ERE alternations for grep -E.
FORBIDDEN_PATTERNS='DROP COLUMN|DROP TABLE|ALTER COLUMN [A-Za-z_]+ TYPE|ADD COLUMN [A-Za-z_]+ [^,;]* NOT NULL[^,;]*(;|$)|DROP CONSTRAINT|DROP INDEX [A-Za-z_]*UNIQUE'
# Identify newly-added migration files in the current range.
new_migrations=$(git diff --name-only --diff-filter=A "$GIT_RANGE" -- "$MIGRATIONS_DIR" 2>/dev/null \
| grep -E "^${MIGRATIONS_DIR}/[0-9]+_.*\.sql$" || true)
if [ -z "$new_migrations" ]; then
log "no new migrations in $GIT_RANGE — nothing to check"
exit 0
fi
log "checking $(echo "$new_migrations" | wc -l) new migration(s) in $GIT_RANGE"
findings=0
for f in $new_migrations; do
log " scanning $f"
# -i case-insensitive ; -E extended regex ; -n line numbers
matches=$(grep -inE "$FORBIDDEN_PATTERNS" "$f" || true)
if [ -n "$matches" ]; then
findings=$((findings + 1))
log ""
log " ⚠ NON-BACKWARD-COMPAT pattern in $f :"
echo "$matches" | sed 's/^/ /' >&2
# Special case : ADD COLUMN ... NOT NULL ... DEFAULT <x> is fine.
# The regex above tries to exclude that but the match-then-filter
# approach is more reliable than a single regex. Suppress matches
# that include `DEFAULT` on the same line.
real=$(echo "$matches" | grep -ivE "DEFAULT" || true)
if [ -z "$real" ]; then
log " ↳ all matches include DEFAULT clause — actually backward-compat"
findings=$((findings - 1))
fi
fi
done
if [ "$findings" -gt 0 ]; then
log ""
log "$findings migration(s) flagged as potentially non-backward-compat."
if [ "$FORCE_MIGRATE" = "1" ]; then
log "FORCE_MIGRATE=1 set — proceeding anyway."
exit 0
fi
log ""
log "Options to proceed :"
log " 1. Split the migration : ship the additive part now, drop the"
log " non-compat part in v+1 after old backends are off."
log " 2. Set FORCE_MIGRATE=1 if you accept the risk + document the"
log " justification in the migration's commit message."
exit 1
fi
log "PASS : all new migrations are backward-compat"
exit 0

View file

@ -1,287 +0,0 @@
#!/usr/bin/env bash
# deploy-canary.sh — canary release for the active/active backend-api fleet.
#
# Walks the standard canary recipe (drain → deploy → health → re-enable
# → SLI monitor → repeat or rollback) end-to-end. Designed to run on
# the host that owns the backend-api Incus containers + the haproxy
# admin socket.
#
# v1.0.9 W5 Day 23.
#
# Usage :
# bash scripts/deploy-canary.sh /path/to/new/veza-api
#
# Required tools : incus, curl, socat (HAProxy admin socket), jq, bash 4+.
#
# Required env :
# ARTIFACT path to the new veza-api binary (passed as $1 too)
# Optional env :
# POOL_BACKEND HAProxy backend name (default api_pool)
# CANARY_NODE which container to canary first (default backend-api-2)
# PEER_NODES comma-separated list of peers to roll AFTER canary
# succeeds (default backend-api-1)
# HEALTH_HOST host to curl (default haproxy.lxd ; LB-routed)
# HEALTH_PATH default /api/v1/health
# SLI_WINDOW SLI monitor duration in seconds (default 3600 = 1h)
# SLI_PROBE_INTERVAL seconds between SLI probes (default 30)
# PROM_URL Prometheus query URL (default http://prom.lxd:9090)
# PROM_P95_THRESHOLD_S p95 SLI threshold in seconds (default 0.5)
# PROM_ERR_RATE_THRESHOLD error rate threshold (default 0.005 = 0.5%)
# ROLLBACK_BINARY path to the previous-known-good binary (used on red)
# If unset, rollback skips the binary swap and just
# re-enables the canary node — operator handles the
# real revert.
# PRE_DEPLOY_HOOK path to script that validates migrations are
# backward-compat. Defaults to scripts/check-migration-backward-compat.sh
# when present.
#
# Exit codes :
# 0 — canary + full roll succeeded
# 1 — pre-deploy validation failed ; nothing was changed
# 2 — canary failed ; rollback executed
# 3 — required tool / env missing
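#
# Example invocation (illustrative paths/values) :
#   ARTIFACT=./bin/veza-api ROLLBACK_BINARY=./bin/veza-api.prev \
#   SLI_WINDOW=1800 PROM_URL=http://prom.lxd:9090 \
#   bash scripts/deploy-canary.sh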
set -euo pipefail
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
ARTIFACT=${ARTIFACT:-${1:-?}}
POOL_BACKEND=${POOL_BACKEND:-api_pool}
CANARY_NODE=${CANARY_NODE:-backend-api-2}
PEER_NODES=${PEER_NODES:-backend-api-1}
HEALTH_HOST=${HEALTH_HOST:-haproxy.lxd}
HEALTH_PATH=${HEALTH_PATH:-/api/v1/health}
SLI_WINDOW=${SLI_WINDOW:-3600}
SLI_PROBE_INTERVAL=${SLI_PROBE_INTERVAL:-30}
PROM_URL=${PROM_URL:-http://prom.lxd:9090}
PROM_P95_THRESHOLD_S=${PROM_P95_THRESHOLD_S:-0.5}
PROM_ERR_RATE_THRESHOLD=${PROM_ERR_RATE_THRESHOLD:-0.005}
ROLLBACK_BINARY=${ROLLBACK_BINARY:-}
PRE_DEPLOY_HOOK=${PRE_DEPLOY_HOOK:-${REPO_ROOT}/scripts/check-migration-backward-compat.sh}
log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; }
die() { log "FAIL: $*"; exit "${2:-1}"; }
require() {
command -v "$1" >/dev/null 2>&1 || die "required tool missing: $1" 3
}
require incus
require curl
require socat
require date
require jq   # prom_query parses Prometheus JSON responses with jq
if [ "$ARTIFACT" = "?" ] || [ ! -f "$ARTIFACT" ]; then
die "ARTIFACT (or \$1) must point to an existing binary" 1
fi
# --------------------------------------------------------------------
# Helpers : HAProxy admin socket commands.
# --------------------------------------------------------------------
HAPROXY_CONTAINER=${HAPROXY_CONTAINER:-haproxy}
ha_cmd() {
incus exec "$HAPROXY_CONTAINER" -- bash -c "echo '$1' | socat /run/haproxy/admin.sock -"
}
ha_state() {
local node=$1
ha_cmd "show servers state $POOL_BACKEND" \
| awk -v n="$node" '$0 ~ n {print $7}' | head -1
# field 7 in `show servers state` is operational_state (0=stop, 1=run, 2=ready/drain)
}
ha_drain() {
log "haproxy : drain $1"
ha_cmd "set server ${POOL_BACKEND}/${1} state drain" >/dev/null
}
ha_ready() {
log "haproxy : ready $1"
ha_cmd "set server ${POOL_BACKEND}/${1} state ready" >/dev/null
}
ha_wait_drained() {
# Drain finishes when the server reports 0 active connections.
local node=$1
local deadline=$(( $(date +%s) + 60 ))
while [ "$(date +%s)" -lt "$deadline" ]; do
local n
n=$(ha_cmd "show stat" | awk -F, -v s="$node" '$2 == s {print $5; exit}' 2>/dev/null || echo 0)
if [ "${n:-0}" = "0" ]; then
log "haproxy : $node drained (0 active connections)"
return 0
fi
sleep 2
done
log "WARN : $node still has active connections after 60s drain ; proceeding anyway"
}
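# Manual equivalent from the Incus host, useful when debugging a stuck drain —
# a sketch ; backend/server names follow POOL_BACKEND/CANARY_NODE defaults :
#   incus exec haproxy -- bash -c \
#     "echo 'set server api_pool/backend-api-2 state drain' | socat /run/haproxy/admin.sock -"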
curl_health() {
curl --max-time 5 -sS -o /dev/null -w "%{http_code}" \
"http://${HEALTH_HOST}${HEALTH_PATH}" 2>/dev/null || echo "000"
}
# --------------------------------------------------------------------
# SLI monitor — query Prometheus over the SLI_WINDOW. Fails as soon as
# any probe reports red so we can rollback fast.
# --------------------------------------------------------------------
prom_query() {
local q=$1
curl --max-time 10 -sS -G --data-urlencode "query=${q}" \
"${PROM_URL}/api/v1/query" 2>/dev/null \
| jq -r '.data.result[0].value[1] // "0"' 2>/dev/null || echo 0
}
monitor_sli() {
log "monitoring SLI for ${SLI_WINDOW}s (probes every ${SLI_PROBE_INTERVAL}s)"
local deadline=$(( $(date +%s) + SLI_WINDOW ))
local probes=0
local first_red=""
while [ "$(date +%s)" -lt "$deadline" ]; do
probes=$((probes + 1))
local p95 err
p95=$(prom_query 'histogram_quantile(0.95, sum(rate(veza_gin_http_request_duration_seconds_bucket{job="veza-backend"}[5m])) by (le))')
err=$(prom_query 'sum(rate(veza_gin_http_requests_total{job="veza-backend",status=~"5.."}[5m])) / sum(rate(veza_gin_http_requests_total{job="veza-backend"}[5m]))')
log " probe $probes : p95=${p95}s err=${err}"
# awk used for float comparison ; bash test only does integers.
if awk -v a="$p95" -v b="$PROM_P95_THRESHOLD_S" 'BEGIN{exit !(a > b)}'; then
first_red="p95 ${p95}s > threshold ${PROM_P95_THRESHOLD_S}s"
break
fi
if awk -v a="$err" -v b="$PROM_ERR_RATE_THRESHOLD" 'BEGIN{exit !(a > b)}'; then
first_red="error rate ${err} > threshold ${PROM_ERR_RATE_THRESHOLD}"
break
fi
sleep "$SLI_PROBE_INTERVAL"
done
if [ -n "$first_red" ]; then
log "SLI red after $probes probe(s) : $first_red"
return 1
fi
log "SLI green for the full ${SLI_WINDOW}s window ($probes probes)"
return 0
}
# --------------------------------------------------------------------
# Deploy + rollback primitives.
# --------------------------------------------------------------------
deploy_to() {
local node=$1
local artifact=$2
log "deploying $artifact$node"
incus file push "$artifact" "$node/opt/veza/backend-api/veza-api" \
--uid 1001 --gid 1001 --mode 0755
incus exec "$node" -- systemctl restart veza-backend-api
}
verify_node_health() {
local node=$1
log "node health check : $node"
local deadline=$(( $(date +%s) + 60 ))
while [ "$(date +%s)" -lt "$deadline" ]; do
if incus exec "$node" -- curl --max-time 3 -sSf http://127.0.0.1:8080${HEALTH_PATH} >/dev/null 2>&1; then
log " $node : 200"
return 0
fi
sleep 2
done
return 1
}
rollback_canary() {
log "ROLLBACK : restoring $CANARY_NODE"
if [ -n "$ROLLBACK_BINARY" ] && [ -f "$ROLLBACK_BINARY" ]; then
deploy_to "$CANARY_NODE" "$ROLLBACK_BINARY" || true
verify_node_health "$CANARY_NODE" || log "rollback : node health check still failing"
else
log "ROLLBACK_BINARY not set — leaving binary in place ; operator must finish revert"
fi
ha_ready "$CANARY_NODE"
}
# --------------------------------------------------------------------
# 1. Pre-deploy hook (migration backward-compat).
# --------------------------------------------------------------------
log "step 1 : pre-deploy hook"
if [ -x "$PRE_DEPLOY_HOOK" ]; then
if ! "$PRE_DEPLOY_HOOK"; then
die "pre-deploy hook ($PRE_DEPLOY_HOOK) reported a backward-incompat migration ; aborting" 1
fi
else
log " PRE_DEPLOY_HOOK ($PRE_DEPLOY_HOOK) not executable ; skipping (no-op)"
fi
# --------------------------------------------------------------------
# 2. Drain canary node.
# --------------------------------------------------------------------
log "step 2 : drain $CANARY_NODE in HAProxy"
ha_drain "$CANARY_NODE"
ha_wait_drained "$CANARY_NODE"
# --------------------------------------------------------------------
# 3. Deploy artifact to the canary node.
# --------------------------------------------------------------------
log "step 3 : deploy artifact to $CANARY_NODE"
deploy_to "$CANARY_NODE" "$ARTIFACT"
# --------------------------------------------------------------------
# 4. Per-node health check.
# --------------------------------------------------------------------
log "step 4 : health check on $CANARY_NODE"
if ! verify_node_health "$CANARY_NODE"; then
log "$CANARY_NODE failed health check post-deploy"
rollback_canary
exit 2
fi
# --------------------------------------------------------------------
# 5. Re-enable + LB health check (proves HAProxy sees the node ready).
# --------------------------------------------------------------------
log "step 5 : re-enable $CANARY_NODE in HAProxy"
ha_ready "$CANARY_NODE"
sleep 5
lb_status=$(curl_health)
if [ "$lb_status" != "200" ]; then
log "LB health check after re-enable returned $lb_status ; rolling back"
rollback_canary
exit 2
fi
# --------------------------------------------------------------------
# 6. SLI monitor — the canary keeps serving live traffic ; rollback if the SLO breaches.
# --------------------------------------------------------------------
log "step 6 : monitor SLI on the canary"
if ! monitor_sli; then
log "SLI red — rolling back the canary"
rollback_canary
exit 2
fi
# --------------------------------------------------------------------
# 7. SLI green — repeat on each peer.
# --------------------------------------------------------------------
log "step 7 : SLI green on canary, rolling peers : $PEER_NODES"
IFS=',' read -ra peers <<< "$PEER_NODES"
for peer in "${peers[@]}"; do
log "── peer $peer ───────────────────────────"
ha_drain "$peer"
ha_wait_drained "$peer"
deploy_to "$peer" "$ARTIFACT"
if ! verify_node_health "$peer"; then
log "$peer health check failed post-deploy"
log "WARN : leaving $peer drained ; canary node still serves traffic"
log " operator must re-deploy known-good binary or repair $peer manually"
exit 2
fi
ha_ready "$peer"
sleep 5
lb_status=$(curl_health)
if [ "$lb_status" != "200" ]; then
log "LB health check after re-enable of $peer returned $lb_status — abandoning roll"
exit 2
fi
done
log "PASS : canary $CANARY_NODE + peers $PEER_NODES deployed cleanly"
exit 0