Compare commits
8 commits: 70df301823 ... 172729bdff

Commits in range (newest first): 172729bdff, 8200eeba6e, 989d88236b, 3a67763d6f, 02ce938b3f, 257ea4b159, 9f5e9c9c38, 4acbcc170a
25 changed files with 2744 additions and 66 deletions

.forgejo/workflows/cleanup-failed.yml (new file, 79 lines)
@@ -0,0 +1,79 @@
# cleanup-failed.yml — workflow_dispatch only.
#
# Tears down the kept-alive failed-deploy color (the inactive one
# that survived a Phase D / Phase F failure for forensics).
# Operator triggers this once they have read the journalctl output.
#
# Hard safety in playbooks/cleanup_failed.yml: refuses to destroy
# the currently-active color.
name: Veza cleanup failed-deploy color

on:
  workflow_dispatch:
    inputs:
      env:
        description: "Environment to clean up"
        required: true
        type: choice
        options: [staging, prod]
      color:
        description: "Color to destroy (must NOT be the active one)"
        required: true
        type: choice
        options: [blue, green]

concurrency:
  group: cleanup-${{ inputs.env }}
  cancel-in-progress: false

jobs:
  cleanup:
    name: Destroy ${{ inputs.color }} app containers in ${{ inputs.env }}
    runs-on: [self-hosted, incus]
    timeout-minutes: 10
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 1

      - name: Install ansible
        run: |
          sudo apt-get update -qq
          sudo apt-get install -y ansible
          ansible-galaxy collection install community.general

      - name: Write vault password
        env:
          VAULT_PW: ${{ secrets.ANSIBLE_VAULT_PASSWORD }}
        run: |
          printf '%s' "$VAULT_PW" > "$RUNNER_TEMP/vault-pass"
          chmod 0400 "$RUNNER_TEMP/vault-pass"
          echo "VAULT_PASS_FILE=$RUNNER_TEMP/vault-pass" >> "$GITHUB_ENV"

      - name: Run cleanup_failed.yml
        working-directory: infra/ansible
        env:
          ANSIBLE_LOG_PATH: ${{ runner.temp }}/ansible-cleanup-${{ inputs.env }}-${{ inputs.color }}.log
          ANSIBLE_HOST_KEY_CHECKING: "False"
        run: |
          ansible-playbook \
            -i inventory/${{ inputs.env }}.yml \
            playbooks/cleanup_failed.yml \
            --vault-password-file "$VAULT_PASS_FILE" \
            -e veza_env=${{ inputs.env }} \
            -e target_color=${{ inputs.color }}

      - name: Upload Ansible log
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: ansible-cleanup-${{ inputs.env }}-${{ inputs.color }}
          path: ${{ runner.temp }}/ansible-cleanup-*.log
          retention-days: 30

      - name: Shred vault password file
        if: always()
        run: |
          if [ -f "$VAULT_PASS_FILE" ]; then
            shred -u "$VAULT_PASS_FILE" 2>/dev/null || rm -f "$VAULT_PASS_FILE"
          fi

.forgejo/workflows/deploy.yml (new file, 358 lines)
@@ -0,0 +1,358 @@
# Veza deploy pipeline.
#
# Triggers (intentionally narrow — see SECURITY note below):
#   push:main          → env=staging, sha=$GITHUB_SHA
#   push:tags ['v*']   → env=prod, sha=$GITHUB_SHA (tag's pointee)
#   workflow_dispatch  → operator-supplied env + sha
#
# SECURITY: this workflow runs on a self-hosted runner with access to
# the Incus unix socket (effectively root on the host). DO NOT add
# `pull_request` or any fork-influenced trigger here — an attacker-
# controlled fork would be able to `incus exec` arbitrarily. The
# narrow trigger list above is the security boundary.
#
# Sequence : build (3 jobs in parallel) → upload artifacts → deploy.
name: Veza deploy

on:
  push:
    branches: [main]
    tags: ['v*']
  workflow_dispatch:
    inputs:
      env:
        description: "Environment to deploy"
        required: true
        default: staging
        type: choice
        options: [staging, prod]
      release_sha:
        description: "Full git SHA to deploy (defaults to current HEAD if empty)"
        required: false
        type: string

concurrency:
  # Only one deploy per env at a time. Newer pushes cancel older
  # in-flight builds for the same env (the user almost always wants
  # the newer commit).
  group: deploy-${{ github.ref_type == 'tag' && 'prod' || 'staging' }}
  cancel-in-progress: true

env:
  # Where build artefacts land. Set in Forgejo repo Variables :
  #   FORGEJO_REGISTRY_URL = https://forgejo.veza.fr/api/packages/talas/generic
  REGISTRY_URL: ${{ vars.FORGEJO_REGISTRY_URL }}

jobs:
  # =================================================================
  # Resolve env + sha from the trigger.
  # =================================================================
  resolve:
    name: Resolve env + SHA
    runs-on: ubuntu-latest
    outputs:
      env: ${{ steps.r.outputs.env }}
      sha: ${{ steps.r.outputs.sha }}
    steps:
      - name: Resolve
        id: r
        run: |
          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
            ENV="${{ inputs.env }}"
            SHA="${{ inputs.release_sha || github.sha }}"
          elif [ "${{ github.ref_type }}" = "tag" ]; then
            ENV="prod"
            SHA="${{ github.sha }}"
          else
            ENV="staging"
            SHA="${{ github.sha }}"
          fi
          if ! echo "$SHA" | grep -Eq '^[0-9a-f]{40}$'; then
            echo "SHA '$SHA' is not a 40-char git SHA"
            exit 1
          fi
          echo "env=$ENV" >> "$GITHUB_OUTPUT"
          echo "sha=$SHA" >> "$GITHUB_OUTPUT"
          echo "Resolved env=$ENV sha=$SHA"

  # =================================================================
  # Build backend (Go).
  # =================================================================
  build-backend:
    name: Build backend
    needs: resolve
    runs-on: ubuntu-latest
    timeout-minutes: 20
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 1
          ref: ${{ needs.resolve.outputs.sha }}

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: "1.25"
          cache: true
          cache-dependency-path: veza-backend-api/go.sum

      - name: Test
        working-directory: veza-backend-api
        env:
          VEZA_SKIP_INTEGRATION: "1"
        run: go test ./... -short -count=1 -timeout 300s

      - name: Build veza-api (CGO=0, static)
        working-directory: veza-backend-api
        env:
          CGO_ENABLED: "0"
          GOOS: linux
          GOARCH: amd64
        run: |
          go build -trimpath -ldflags "-s -w" \
            -o ./bin/veza-api ./cmd/api/main.go
          go build -trimpath -ldflags "-s -w" \
            -o ./bin/migrate_tool ./cmd/migrate_tool/main.go

      - name: Stage tarball contents
        working-directory: veza-backend-api
        run: |
          STAGE="$RUNNER_TEMP/veza-backend"
          mkdir -p "$STAGE/migrations"
          cp ./bin/veza-api ./bin/migrate_tool "$STAGE/"
          cp -r ./migrations/* "$STAGE/migrations/" || true
          echo "${{ needs.resolve.outputs.sha }}" > "$STAGE/VERSION"

      - name: Pack tarball
        run: |
          cd "$RUNNER_TEMP"
          tar --use-compress-program=zstd -cf \
            "veza-backend-${{ needs.resolve.outputs.sha }}.tar.zst" \
            -C "$RUNNER_TEMP/veza-backend" .

      - name: Push to Forgejo Package Registry
        env:
          TOKEN: ${{ secrets.FORGEJO_REGISTRY_TOKEN }}
        run: |
          set -e
          TARBALL="veza-backend-${{ needs.resolve.outputs.sha }}.tar.zst"
          URL="${REGISTRY_URL}/veza-backend/${{ needs.resolve.outputs.sha }}/${TARBALL}"
          echo "PUT → $URL"
          curl -fsSL --fail-with-body -X PUT \
            -H "Authorization: token ${TOKEN}" \
            --upload-file "$RUNNER_TEMP/${TARBALL}" \
            "${URL}"

  # =================================================================
  # Build stream (Rust).
  # =================================================================
  build-stream:
    name: Build stream
    needs: resolve
    runs-on: ubuntu-latest
    timeout-minutes: 30
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 1
          ref: ${{ needs.resolve.outputs.sha }}

      - name: Set up Rust toolchain
        run: |
          command -v rustup >/dev/null || \
            curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable
          source "$HOME/.cargo/env"
          rustup target add x86_64-unknown-linux-musl
          echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"
          sudo apt-get update -qq && sudo apt-get install -y musl-tools

      - name: Cache cargo + target
        uses: actions/cache@v4
        with:
          path: |
            ~/.cargo/registry
            ~/.cargo/git
            veza-stream-server/target
          key: deploy-${{ runner.os }}-cargo-${{ hashFiles('veza-stream-server/Cargo.lock') }}
          restore-keys: |
            deploy-${{ runner.os }}-cargo-

      - name: Test
        working-directory: veza-stream-server
        run: cargo test --workspace

      - name: Build stream_server (musl static)
        working-directory: veza-stream-server
        run: |
          cargo build --release --locked \
            --target x86_64-unknown-linux-musl

      - name: Stage tarball contents
        working-directory: veza-stream-server
        run: |
          STAGE="$RUNNER_TEMP/veza-stream"
          mkdir -p "$STAGE"
          cp ./target/x86_64-unknown-linux-musl/release/stream_server "$STAGE/"
          echo "${{ needs.resolve.outputs.sha }}" > "$STAGE/VERSION"

      - name: Pack tarball
        run: |
          cd "$RUNNER_TEMP"
          tar --use-compress-program=zstd -cf \
            "veza-stream-${{ needs.resolve.outputs.sha }}.tar.zst" \
            -C "$RUNNER_TEMP/veza-stream" .

      - name: Push to Forgejo Package Registry
        env:
          TOKEN: ${{ secrets.FORGEJO_REGISTRY_TOKEN }}
        run: |
          set -e
          TARBALL="veza-stream-${{ needs.resolve.outputs.sha }}.tar.zst"
          URL="${REGISTRY_URL}/veza-stream/${{ needs.resolve.outputs.sha }}/${TARBALL}"
          echo "PUT → $URL"
          curl -fsSL --fail-with-body -X PUT \
            -H "Authorization: token ${TOKEN}" \
            --upload-file "$RUNNER_TEMP/${TARBALL}" \
            "${URL}"

  # =================================================================
  # Build web (React/Vite).
  # =================================================================
  build-web:
    name: Build web
    needs: resolve
    runs-on: ubuntu-latest
    timeout-minutes: 20
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 1
          ref: ${{ needs.resolve.outputs.sha }}

      - name: Use Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "20"
          cache: "npm"
          cache-dependency-path: package-lock.json

      - name: Install dependencies
        run: npm ci

      - name: Build design tokens
        run: npm run build:tokens --workspace=@veza/design-system

      - name: Build SPA
        working-directory: apps/web
        env:
          VITE_API_URL: /api/v1
          VITE_DOMAIN: ${{ needs.resolve.outputs.env == 'prod' && 'veza.fr' || 'staging.veza.fr' }}
          VITE_RELEASE_SHA: ${{ needs.resolve.outputs.sha }}
        run: npm run build

      - name: Stage tarball contents
        run: |
          STAGE="$RUNNER_TEMP/veza-web"
          mkdir -p "$STAGE"
          cp -r apps/web/dist/* "$STAGE/"
          echo "${{ needs.resolve.outputs.sha }}" > "$STAGE/VERSION"

      - name: Pack tarball
        run: |
          cd "$RUNNER_TEMP"
          tar --use-compress-program=zstd -cf \
            "veza-web-${{ needs.resolve.outputs.sha }}.tar.zst" \
            -C "$RUNNER_TEMP/veza-web" .

      - name: Push to Forgejo Package Registry
        env:
          TOKEN: ${{ secrets.FORGEJO_REGISTRY_TOKEN }}
        run: |
          set -e
          TARBALL="veza-web-${{ needs.resolve.outputs.sha }}.tar.zst"
          URL="${REGISTRY_URL}/veza-web/${{ needs.resolve.outputs.sha }}/${TARBALL}"
          echo "PUT → $URL"
          curl -fsSL --fail-with-body -X PUT \
            -H "Authorization: token ${TOKEN}" \
            --upload-file "$RUNNER_TEMP/${TARBALL}" \
            "${URL}"

  # =================================================================
  # Deploy via Ansible. Runs on the self-hosted runner that has
  # Incus socket access (label `incus`). Requires Forgejo secrets:
  #   ANSIBLE_VAULT_PASSWORD  — unlocks group_vars/all/vault.yml
  #   FORGEJO_REGISTRY_TOKEN  — same token the build jobs use,
  #                             passed to ansible-playbook so
  #                             the data containers can fetch
  #                             the tarballs they were just sent.
  # =================================================================
  deploy:
    name: Deploy via Ansible
    needs: [resolve, build-backend, build-stream, build-web]
    runs-on: [self-hosted, incus]
    timeout-minutes: 30
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 1
          ref: ${{ needs.resolve.outputs.sha }}

      - name: Install ansible + community.general + community.postgresql + community.rabbitmq
        run: |
          sudo apt-get update -qq
          sudo apt-get install -y ansible python3-psycopg2 python3-pip
          ansible-galaxy collection install \
            community.general \
            community.postgresql \
            community.rabbitmq

      - name: Write vault password to a tmpfile
        env:
          VAULT_PW: ${{ secrets.ANSIBLE_VAULT_PASSWORD }}
        run: |
          printf '%s' "$VAULT_PW" > "$RUNNER_TEMP/vault-pass"
          chmod 0400 "$RUNNER_TEMP/vault-pass"
          echo "VAULT_PASS_FILE=$RUNNER_TEMP/vault-pass" >> "$GITHUB_ENV"

      - name: Run deploy_data.yml (idempotent provisioning + ZFS snapshot)
        working-directory: infra/ansible
        env:
          ANSIBLE_LOG_PATH: ${{ runner.temp }}/ansible-data-${{ needs.resolve.outputs.env }}-${{ needs.resolve.outputs.sha }}.log
          ANSIBLE_HOST_KEY_CHECKING: "False"
        run: |
          ansible-playbook \
            -i inventory/${{ needs.resolve.outputs.env }}.yml \
            playbooks/deploy_data.yml \
            --vault-password-file "$VAULT_PASS_FILE" \
            -e veza_env=${{ needs.resolve.outputs.env }} \
            -e veza_release_sha=${{ needs.resolve.outputs.sha }} \
            -e vault_forgejo_registry_token=${{ secrets.FORGEJO_REGISTRY_TOKEN }}

      - name: Run deploy_app.yml (blue/green)
        working-directory: infra/ansible
        env:
          ANSIBLE_LOG_PATH: ${{ runner.temp }}/ansible-app-${{ needs.resolve.outputs.env }}-${{ needs.resolve.outputs.sha }}.log
          ANSIBLE_HOST_KEY_CHECKING: "False"
        run: |
          ansible-playbook \
            -i inventory/${{ needs.resolve.outputs.env }}.yml \
            playbooks/deploy_app.yml \
            --vault-password-file "$VAULT_PASS_FILE" \
            -e veza_env=${{ needs.resolve.outputs.env }} \
            -e veza_release_sha=${{ needs.resolve.outputs.sha }} \
            -e vault_forgejo_registry_token=${{ secrets.FORGEJO_REGISTRY_TOKEN }}

      - name: Upload Ansible logs (for forensics)
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: ansible-logs-${{ needs.resolve.outputs.env }}-${{ needs.resolve.outputs.sha }}
          path: ${{ runner.temp }}/ansible-*.log
          retention-days: 30

      - name: Shred vault password file
        if: always()
        run: |
          if [ -f "$VAULT_PASS_FILE" ]; then
            shred -u "$VAULT_PASS_FILE" 2>/dev/null || rm -f "$VAULT_PASS_FILE"
          fi

.forgejo/workflows/rollback.yml (new file, 118 lines)
@@ -0,0 +1,118 @@
# rollback.yml — workflow_dispatch only.
#
# Two modes :
#   fast — flip HAProxy back to the previous color. ~5s. Requires
#          the target color's containers to still be alive
#          (i.e., no later deploy has recycled them).
#   full — re-run deploy_app.yml with a specific (older) release_sha.
#          ~5-10min. The artefact must still be in the Forgejo
#          registry (default retention 30 SHA per component).
#
# See docs/RUNBOOK_ROLLBACK.md for decision criteria.
name: Veza rollback

on:
  workflow_dispatch:
    inputs:
      env:
        description: "Environment to rollback"
        required: true
        type: choice
        options: [staging, prod]
      mode:
        description: "Rollback mode"
        required: true
        type: choice
        options: [fast, full]
      target_color:
        description: "(mode=fast only) color to flip back TO (the prior active one)"
        required: false
        type: choice
        options: [blue, green]
      release_sha:
        description: "(mode=full only) 40-char SHA of the release to redeploy"
        required: false
        type: string

concurrency:
  group: rollback-${{ inputs.env }}
  cancel-in-progress: false

jobs:
  rollback:
    name: Rollback ${{ inputs.env }} (${{ inputs.mode }})
    runs-on: [self-hosted, incus]
    timeout-minutes: 30
    steps:
      - name: Validate inputs
        run: |
          if [ "${{ inputs.mode }}" = "fast" ] && [ -z "${{ inputs.target_color }}" ]; then
            echo "mode=fast requires target_color"
            exit 1
          fi
          if [ "${{ inputs.mode }}" = "full" ]; then
            if [ -z "${{ inputs.release_sha }}" ]; then
              echo "mode=full requires release_sha"
              exit 1
            fi
            if ! echo "${{ inputs.release_sha }}" | grep -Eq '^[0-9a-f]{40}$'; then
              echo "release_sha is not a 40-char git SHA"
              exit 1
            fi
          fi

      - uses: actions/checkout@v4
        with:
          fetch-depth: 1
          ref: ${{ inputs.mode == 'full' && inputs.release_sha || github.ref }}

      - name: Install ansible + collections
        run: |
          sudo apt-get update -qq
          sudo apt-get install -y ansible python3-psycopg2
          ansible-galaxy collection install \
            community.general \
            community.postgresql \
            community.rabbitmq

      - name: Write vault password
        env:
          VAULT_PW: ${{ secrets.ANSIBLE_VAULT_PASSWORD }}
        run: |
          printf '%s' "$VAULT_PW" > "$RUNNER_TEMP/vault-pass"
          chmod 0400 "$RUNNER_TEMP/vault-pass"
          echo "VAULT_PASS_FILE=$RUNNER_TEMP/vault-pass" >> "$GITHUB_ENV"

      - name: Run rollback.yml
        working-directory: infra/ansible
        env:
          ANSIBLE_LOG_PATH: ${{ runner.temp }}/ansible-rollback-${{ inputs.env }}-${{ inputs.mode }}.log
          ANSIBLE_HOST_KEY_CHECKING: "False"
        run: |
          EXTRA="-e veza_env=${{ inputs.env }} -e mode=${{ inputs.mode }}"
          if [ "${{ inputs.mode }}" = "fast" ]; then
            EXTRA="$EXTRA -e target_color=${{ inputs.target_color }}"
          else
            EXTRA="$EXTRA -e veza_release_sha=${{ inputs.release_sha }}"
            EXTRA="$EXTRA -e vault_forgejo_registry_token=${{ secrets.FORGEJO_REGISTRY_TOKEN }}"
          fi
          ansible-playbook \
            -i inventory/${{ inputs.env }}.yml \
            playbooks/rollback.yml \
            --vault-password-file "$VAULT_PASS_FILE" \
            $EXTRA

      - name: Upload Ansible log
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: ansible-rollback-${{ inputs.env }}-${{ inputs.mode }}
          path: ${{ runner.temp }}/ansible-rollback-*.log
          retention-days: 30

      - name: Shred vault password file
        if: always()
        run: |
          if [ -f "$VAULT_PASS_FILE" ]; then
            shred -u "$VAULT_PASS_FILE" 2>/dev/null || rm -f "$VAULT_PASS_FILE"
          fi

.gitignore (vendored, 11 lines added)
@@ -265,3 +265,14 @@ frontend_screenshots/

# Audit_remediation glob (supersedes J2's exact-match json)
apps/web/audit_remediation*

# ============================================================
# Ansible Vault — secrets at rest stay encrypted in vault.yml
# (committed). The vault password used to unlock them MUST NOT
# be committed; the Forgejo runner reads it from a repo secret.
# ============================================================
infra/ansible/.vault-pass
infra/ansible/.vault-pass.*
# Local copies devs sometimes drop next to the repo for editing
.vault-pass
.vault-pass.*

docs/CANARY_RELEASE.md (new file, 111 lines)
@@ -0,0 +1,111 @@
# Canary release — backend-api

> **Audience** : on-call engineer running a release.
> **Trigger** : a new backend-api binary signed-off for prod.
> **Owner** : whoever's on the deploy rota that day.

The canary recipe ships the new binary to **one** backend at a time, watches the SLI for a window, and only continues to the next backend when the SLI stays green. If the SLI breaches at any point, the canary node rolls back automatically to the last-known-good binary.

## Trigger conditions

Run the canary script when one of these is true :

- A normal feature release. New code path, no schema migration that requires lockstep coordination.
- A hot-fix on a Sev-2 or below issue. Sev-1 (security or data-integrity) follows the all-stop rotate path documented in `docs/runbooks/INCIDENT_RESPONSE.md` instead.

## Pre-flight checklist

- [ ] **Migration backward-compat** : the latest schema migration is additive only — no `DROP COLUMN`, no `ALTER COLUMN ... TYPE`, no `ADD COLUMN ... NOT NULL` without `DEFAULT`. The script's pre-deploy hook (`scripts/check-migration-backward-compat.sh`) refuses to proceed when it finds one ; bypass with `FORCE_MIGRATE=1` only after you've split the migration in your head.
- [ ] **Last-known-good binary** is preserved. Either : (a) the previous release's `veza-api` is still on the host at `/opt/veza/backend-api/veza-api.previous`, OR (b) you have it locally and pass `ROLLBACK_BINARY=/path/to/old/veza-api` as env to the script.
- [ ] **Prometheus reachable** from the deploy host. The SLI monitor queries `${PROM_URL}` (default `http://prom.lxd:9090`) every `${SLI_PROBE_INTERVAL}` seconds for 1 hour.
- [ ] **HAProxy admin socket reachable** : the script execs into the haproxy Incus container to drive `set server ${POOL}/${NODE} state drain|ready` via socat (a minimal sketch of these calls follows the checklist).
- [ ] **No game day in the same window.** Canary needs a quiet baseline ; chaos drills will push the SLI red and trigger a false rollback.
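
For reference, a minimal sketch of the drain/ready calls described above. The real commands live in `scripts/deploy-canary.sh`; the container name, socket path and pool/node values here are assumptions based on this page's defaults.

```bash
# Hypothetical illustration only. Assumes the HAProxy container is named
# "haproxy", its admin socket is /var/run/haproxy/admin.sock, and POOL/NODE
# match the defaults quoted in this doc (api_pool / backend-api-2).
POOL=api_pool
NODE=backend-api-2

# Take the canary out of rotation:
echo "set server ${POOL}/${NODE} state drain" | \
  incus exec haproxy -- socat stdio /var/run/haproxy/admin.sock

# Watch active connections fall to zero before pushing the binary:
echo "show servers conn ${POOL}" | \
  incus exec haproxy -- socat stdio /var/run/haproxy/admin.sock

# Put the node back once its health check passes:
echo "set server ${POOL}/${NODE} state ready" | \
  incus exec haproxy -- socat stdio /var/run/haproxy/admin.sock
```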

## How

### One-shot via Make

```bash
make deploy-canary ARTIFACT=/tmp/veza-api-v1.0.10
```

The Make target wraps the script with reasonable defaults. Override any env (see the script header) by exporting before the `make` call.

### Direct script invocation

```bash
ARTIFACT=/tmp/veza-api-v1.0.10 \
ROLLBACK_BINARY=/opt/veza/backend-api/veza-api.previous \
SLI_WINDOW=3600 \
PROM_URL=http://prom.lxd:9090 \
bash scripts/deploy-canary.sh
```

The script is idempotent on the steps that matter : draining an already-drained server is a no-op ; pushing the same binary twice is a no-op (file mtime invariant). Re-runs after a partial failure are safe.

## What happens, in order

1. **Pre-deploy hook** runs `scripts/check-migration-backward-compat.sh` on the new-since-`origin/main` migration files. Forbidden patterns abort the deploy.
2. **Drain `CANARY_NODE`** (default `backend-api-2`) via the HAProxy admin socket. Wait until the node has 0 active connections.
3. **Push the binary** to `/opt/veza/backend-api/veza-api` on the canary container. `systemctl restart veza-backend-api`.
4. **Per-node health check** : `curl http://127.0.0.1:8080/api/v1/health` from inside the container. If the node doesn't return 200 within 60 s, rollback.
5. **Re-enable** the canary node in HAProxy.
6. **LB-side health check** : `curl http://haproxy.lxd${HEALTH_PATH}` returns 200 (proves HAProxy sees the node ready and routes through it).
7. **SLI monitor** for `SLI_WINDOW` seconds (default 3600 = 1h). Probes Prometheus every `SLI_PROBE_INTERVAL` (default 30 s) for :
   - p95 of `veza_gin_http_request_duration_seconds_bucket` < `PROM_P95_THRESHOLD_S` (0.5 s)
   - error rate (5xx ÷ total) < `PROM_ERR_RATE_THRESHOLD` (0.005 = 0.5%)
   First red probe → rollback. (An illustrative probe is sketched after this list.)
8. **Roll the peers** : for each `PEER_NODES` entry (default `backend-api-1`), repeat steps 2–6 (drain → deploy → health → re-enable → LB check). The peer roll skips the SLI monitor because the canary already proved the SLI ; if a peer-specific failure happens (binary corrupt on push, container disk full), the script bails out.
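
One probe iteration of those two SLIs could look roughly like this against the Prometheus HTTP API. Illustrative only: the duration-bucket metric and the 0.5 s / 0.5 % thresholds come from step 7, but the request-total metric name, label filters and rate window are guesses, and `jq` is assumed to be installed on the deploy host.

```bash
# Illustrative SLI probe, not the script's actual implementation.
PROM_URL="${PROM_URL:-http://prom.lxd:9090}"

p95=$(curl -fsS "${PROM_URL}/api/v1/query" --data-urlencode \
  'query=histogram_quantile(0.95, sum by (le) (rate(veza_gin_http_request_duration_seconds_bucket[5m])))' \
  | jq -r '.data.result[0].value[1]')

# The 5xx-ratio query assumes a companion counter metric; adjust to whatever
# the backend actually exports.
err_rate=$(curl -fsS "${PROM_URL}/api/v1/query" --data-urlencode \
  'query=sum(rate(veza_gin_http_requests_total{status=~"5.."}[5m])) / sum(rate(veza_gin_http_requests_total[5m]))' \
  | jq -r '.data.result[0].value[1] // "0"')

# A single red probe means rollback.
awk -v p="$p95" -v e="$err_rate" 'BEGIN { exit !(p < 0.5 && e < 0.005) }' \
  || echo "SLI breach: p95=${p95}s err_rate=${err_rate}"
```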

## Rollback path

The script handles the canary rollback automatically when :

- The pre-deploy hook fails. Nothing is changed ; nothing to revert.
- The canary's health check fails after the deploy. Old binary restored from `ROLLBACK_BINARY`, canary re-enabled.
- The SLI breaches during the monitor window. Same as above.

The script does **NOT** rollback peers automatically — by the time peers are rolling, the canary has already accumulated a green-SLI window. A peer health failure is an artifact of the deploy step (corrupt push, container memory issue), not of the new binary itself, and re-running after fixing the local issue is safer than ping-ponging the binary.

## Manual rollback (full)

When the script doesn't catch the regression — say a slow leak that surfaces after the SLI window closes — the on-call manually drives :

```bash
# Find which backend is on the new binary :
incus exec backend-api-1 -- ls -la /opt/veza/backend-api/veza-api
incus exec backend-api-2 -- ls -la /opt/veza/backend-api/veza-api

# Rotate both back to the previous binary :
for ct in backend-api-1 backend-api-2; do
  incus exec "$ct" -- mv /opt/veza/backend-api/veza-api.previous /opt/veza/backend-api/veza-api
  incus exec "$ct" -- systemctl restart veza-backend-api
done
```

The previous binary is conventionally kept at `${INSTALL_DIR}/veza-api.previous` ; the canary script does NOT copy the current binary there before overwriting (deliberate — that's a deploy-pipeline responsibility, not a per-canary responsibility).

## Configuration knobs

All of these are env vars — the script header is the source of truth for defaults. A worked override example follows the table.

| Knob | Default | When to change |
| --- | --- | --- |
| `POOL_BACKEND` | `api_pool` | If you renamed the HAProxy backend |
| `CANARY_NODE` | `backend-api-2` | Toggle which node receives the canary first |
| `PEER_NODES` | `backend-api-1` | When the fleet grows beyond 2 nodes |
| `SLI_WINDOW` | `3600` (1 h) | Shorten for hot-fixes (300 = 5 min minimum) |
| `SLI_PROBE_INTERVAL` | `30` s | Tighter probes catch a leak faster but cost Prom load |
| `PROM_P95_THRESHOLD_S` | `0.5` | Match the SLO ; loosening it hides regressions |
| `PROM_ERR_RATE_THRESHOLD` | `0.005` (0.5 %) | Match the SLO |
| `ROLLBACK_BINARY` | (unset) | Always set in a real run — auto-rollback can't work without it |
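
For example, a hot-fix run with a shortened monitoring window (values are illustrative; the Make target and artifact path are the ones from the "How" section above):

```bash
# Same one-shot Make entry point, with a 5-minute SLI window and tighter probes.
SLI_WINDOW=300 SLI_PROBE_INTERVAL=15 \
ROLLBACK_BINARY=/opt/veza/backend-api/veza-api.previous \
make deploy-canary ARTIFACT=/tmp/veza-api-v1.0.10
```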

## Acceptance bar (Day 23)

Per `docs/ROADMAP_V1.0_LAUNCH.md` : 3 canary deploys on staging, 2 normal + 1 with a deliberate rollback (e.g. push a binary that hardcodes a 500 on `/api/v1/health`). The rollback exercise verifies the script's auto-revert path actually fires.

## What this doesn't do

- **Cross-LB rolls** : single haproxy assumed. When phase-2 adds keepalived + a second LB, the canary script will need a `--lb-set` arg to roll the LB pair too.
- **Database migrations** : split-read-write migrations (e.g. dual-write during a rename) need a multi-step deploy that this script doesn't model. For now, only additive migrations are supported through the canary.
- **Stream-server canary** : the Rust streamer follows a separate playbook (URI-hash routing means a per-track-id affinity, not a per-session affinity). Same principles apply but the script is backend-api-specific.

infra/ansible/group_vars/README.md (new file, 67 lines)
@@ -0,0 +1,67 @@
# `group_vars/` layout

Three layers, in order of precedence (later wins):

1. `all/main.yml` — defaults shared across every inventory. Cross-cutting
   values like SSH hardening, monitoring agent version, and the Veza
   deploy contract (artifact URL, base image, ports, health probes).
2. `<env>.yml` — environment overrides. Today: `staging.yml`, `prod.yml`
   (and `lab.yml` would live here too if `inventory/lab.yml` ever
   referenced an `all/lab` group). These pin the Incus host,
   container prefix, public domain, log level, and feature flags.
3. `all/vault.yml` — encrypted secrets (Ansible Vault). All entries
   prefixed `vault_*`. Plaintext template at `all/vault.yml.example`
   (a quick way to check which layer wins is sketched below).
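
As a sketch, the merged result for one host can be dumped like this. The host name is taken from the container-naming comments in `staging.yml` and may not match what `inventory/staging.yml` actually defines, and it assumes `.vault-pass` exists as described in the next section.

```bash
# Dump the variables Ansible resolves for one host after all three layers are
# applied. The output is a JSON object; look up the key you care about
# (e.g. veza_log_level) to see which layer's value survived.
cd infra/ansible
ansible-inventory -i inventory/staging.yml \
  --host veza-staging-backend-blue \
  --vault-password-file .vault-pass
```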

## Bootstrapping the vault

The vault file is **not** committed at first. To stand it up:

```bash
cd infra/ansible
cp group_vars/all/vault.yml.example group_vars/all/vault.yml
$EDITOR group_vars/all/vault.yml      # fill in <TODO> placeholders
ansible-vault encrypt group_vars/all/vault.yml
echo "<your strong vault password>" > .vault-pass
chmod 0400 .vault-pass
```

`.vault-pass` is gitignored — never commit it. The Forgejo runner
gets the same password from the `ANSIBLE_VAULT_PASSWORD` repo secret
(see `.forgejo/workflows/deploy.yml`).

To edit later without decrypting on disk:

```bash
ansible-vault edit group_vars/all/vault.yml
```

To rotate the password (e.g., when an operator leaves):

```bash
ansible-vault rekey group_vars/all/vault.yml
echo "<new password>" > .vault-pass
# update Forgejo secret ANSIBLE_VAULT_PASSWORD to the new value
```

## How variables flow into containers

```
[Ansible runtime]                                                    [Container]

group_vars/all/main.yml  ┐
group_vars/<env>.yml     ├──→ roles/veza_app/templates/*.j2 ──→ /etc/veza/<component>.env
group_vars/all/vault.yml ┘                                   ──→ /etc/veza/secrets/jwt-private.pem
                                                             ──→ systemd unit (EnvironmentFile=)
```

The systemd unit then reads `/etc/veza/<component>.env` at start time.
Reload semantics: a config change re-templates the env file and
notifies the systemd handler, which restarts the unit.
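
To see the end result of that flow on a running container, something like the following works. This is a sketch: the container name follows the staging naming convention and the env-file/unit names follow the conventions quoted elsewhere in this repo's docs, so adjust both to your inventory.

```bash
# Inspect what the role rendered inside one app container: the env file the
# templates produced, and the unit that consumes it via EnvironmentFile=.
incus exec veza-staging-backend-blue -- cat /etc/veza/backend.env
incus exec veza-staging-backend-blue -- systemctl cat veza-backend-api
```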

## What lives in `host_vars/`?

`host_vars/<host>.yml` holds **per-host** overrides — typically when one
container in an HA group needs a slightly different config (e.g., the
postgres-primary needs `pg_auto_failover_role: node`, the monitor
needs `pg_auto_failover_role: monitor`). The lab inventory inlines
these as host-level vars; `host_vars/` exists for cases where they
shouldn't bloat the inventory file.
@@ -1,40 +0,0 @@
(removed file: its 40 lines were the old shared defaults, reproduced verbatim at the top of infra/ansible/group_vars/all/main.yml below)

infra/ansible/group_vars/all/main.yml (new file, 90 lines)
@@ -0,0 +1,90 @@
# Shared defaults across every inventory (lab/staging/prod). Override
# per-environment in `group_vars/<group>.yml` or per-host in
# `host_vars/<host>.yml`.
---
# Owner contact (used in some unattended-upgrades + monitoring agent configs).
veza_ops_email: ops@veza.fr

# v1.0.9 Day 5: SSH hardening surface that the `common` role enforces.
# Override these in production via group_vars/veza_prod.yml when the
# bastion's specific port / allowed users are decided. Defaults are
# safe for lab.
ssh_port: 22
ssh_permit_root_login: "no"
ssh_password_authentication: "no"
ssh_allow_users:
  - senke
  - ansible

# fail2ban — per-jail thresholds. The defaults are conservative for
# a self-hosted single-machine deployment; production may want
# lower findtime / higher bantime once Forgejo + Veza traffic is
# baselined.
fail2ban_bantime: 3600    # 1h
fail2ban_findtime: 600    # 10min
fail2ban_maxretry: 5

# unattended-upgrades — security updates only by default. The role
# never enables auto-reboot; ROADMAP_V1.0_LAUNCH.md §5 game day pins
# downtime windows to controlled cycles, not OS-driven reboots.
unattended_upgrades_origins:
  - "${distro_id}:${distro_codename}-security"
  - "${distro_id}ESMApps:${distro_codename}-apps-security"
  - "${distro_id}ESM:${distro_codename}-infra-security"
unattended_upgrades_auto_reboot: false

# Monitoring agent: prometheus node_exporter is the bare-minimum
# host metrics surface (CPU / memory / disk / network). The
# observability stack (Tempo + Loki + Grafana) lands W2 in roadmap.
monitoring_node_exporter_version: "1.8.2"
monitoring_node_exporter_port: 9100

# ============================================================
# Veza app deploy — defaults shared by every environment.
# Each can be overridden in group_vars/{staging,prod}.yml.
# ============================================================

# Forgejo Package Registry where the deploy workflow pushes release
# tarballs. Forgejo's generic-package URL shape is:
#   {base}/{owner}/generic/{package}/{version}/{filename}
# We treat each component as a separate package (`veza-backend`,
# `veza-stream`, `veza-web`), the SHA as the version, and the
# tarball name as the filename. Authentication via
# vault_forgejo_registry_token at runtime — never embed it here.
veza_artifact_base_url: "https://forgejo.veza.fr/api/packages/talas/generic"

# Container image used as the base for fresh app containers. The
# `veza_app` role apt-installs OS deps on top. Pinned tag keeps deploys
# reproducible across base-image updates.
veza_app_base_image: "images:debian/13"

# Per-component HTTP ports. Backend listens on `APP_PORT` env var;
# stream listens on `PORT` env var. Templates render these into env
# files; HAProxy reads them to wire backends.
veza_backend_port: 8080
veza_stream_port: 8082
veza_web_port: 80

# Health probe parameters — used by deploy_app's Phase D and by the
# rollback playbook when verifying a switched color.
veza_healthcheck_retries: 30
veza_healthcheck_delay_seconds: 2
veza_healthcheck_paths:
  backend: /api/v1/health
  stream: /health
  web: /

# OS package set installed in every fresh app container. Component-
# specific extras live in roles/veza_app/vars/<component>.yml.
veza_common_os_packages:
  - ca-certificates
  - curl
  - tzdata
  - zstd          # to decompress release tarballs

# Where artefacts land in-container. Per-SHA subdirs let multiple
# releases coexist for forensics without conflict.
veza_install_root: /opt/veza
veza_config_root: /etc/veza
veza_log_root: /var/log/veza
veza_state_root: /var/lib/veza

infra/ansible/group_vars/all/vault.yml.example (new file, 78 lines)
@@ -0,0 +1,78 @@
# Template for group_vars/all/vault.yml — the encrypted secrets store
# consumed by every playbook. Copy this file to vault.yml, fill in real
# values, then encrypt:
#
#   cp vault.yml.example vault.yml
#   $EDITOR vault.yml                  # fill in real values
#   ansible-vault encrypt vault.yml    # in place
#   echo "<your strong password>" > ../../../.vault-pass   # gitignored
#   chmod 0400 ../../../.vault-pass
#
# After that, every `ansible-playbook` invocation needs:
#   ansible-playbook --vault-password-file infra/ansible/.vault-pass ...
# The Forgejo deploy workflow handles this via the ANSIBLE_VAULT_PASSWORD
# repo secret (see .forgejo/workflows/deploy.yml).
#
# Naming: every secret is prefixed `vault_*` so it's grep-able and so
# `group_vars/all/main.yml` references like `postgres_password:
# "{{ vault_postgres_password }}"` are unambiguous.
---
# --- Database -----------------------------------------------------------
vault_postgres_password: "<TODO: 32+ char strong password for veza role>"
vault_postgres_replication_password: "<TODO: separate password for replication user>"

# --- Cache / queue ------------------------------------------------------
vault_redis_password: "<TODO>"
vault_rabbitmq_password: "<TODO>"

# --- Object storage (MinIO) ---------------------------------------------
vault_minio_root_user: "<TODO: only used to bootstrap the cluster>"
vault_minio_root_password: "<TODO: 16+ chars, MinIO refuses shorter>"
vault_minio_access_key: "<TODO: app-tier access key>"
vault_minio_secret_key: "<TODO: app-tier secret key>"

# --- JWT ----------------------------------------------------------------
# Backend prefers RS256 in prod. Generate with:
#   openssl genrsa -out jwt-private.pem 4096
#   openssl rsa -in jwt-private.pem -pubout -out jwt-public.pem
# Then base64 each:
#   base64 -w0 jwt-private.pem
#   base64 -w0 jwt-public.pem
vault_jwt_signing_key_b64: "<TODO: base64 of RS256 private PEM>"
vault_jwt_public_key_b64: "<TODO: base64 of RS256 public PEM>"

# Chat WebSocket signs its own short-lived tokens — must differ from the
# main JWT secret in production (defense in depth).
vault_chat_jwt_secret: "<TODO: 32+ chars, distinct from JWT signing key>"

# --- App-internal API keys ---------------------------------------------
# Backend ↔ stream-server shared secret. Both services must have the
# same value so /api/v1/internal/* requests authenticate.
vault_stream_internal_api_key: "<TODO: 32+ chars>"

# OAuth refresh tokens are encrypted at rest with this key.
vault_oauth_encryption_key: "<TODO: exactly 32 bytes, raw or hex>"

# --- Email --------------------------------------------------------------
vault_smtp_password: "<TODO>"

# --- Payments -----------------------------------------------------------
# Hyperswitch routes through Stripe Connect. Both keys are required if
# `HYPERSWITCH_ENABLED=true` in group_vars/<env>.yml.
vault_hyperswitch_api_key: "<TODO>"
vault_hyperswitch_webhook_secret: "<TODO>"
vault_stripe_secret_key: "<TODO: sk_live_… in prod, sk_test_… in staging>"

# --- OAuth providers ----------------------------------------------------
# Add only the providers you actually enable; keys consumed by
# templates/backend.env.j2 conditionally on truthiness.
vault_oauth_clients:
  google:
    id: "<TODO>"
    secret: "<TODO>"
  spotify:
    id: "<TODO>"
    secret: "<TODO>"

# --- Sentry / observability --------------------------------------------
vault_sentry_dsn: "<TODO: empty string disables Sentry>"

infra/ansible/group_vars/prod.yml (new file, 42 lines)
@@ -0,0 +1,42 @@
# Prod-specific overrides. Same R720 host as staging in v1.0; separate
# Incus network + container prefix prevents staging/prod from sharing
# any state. Phase-2 (post v1.1) is expected to move prod to a
# dedicated host, at which point only `veza_incus_host` flips.
---
veza_env: prod
veza_release_channel: prod

veza_incus_host: veza-prod
veza_incus_network: veza-net
veza_incus_subnet: 10.0.20.0/24

veza_container_prefix: "veza-"   # production uses unprefixed names — the established convention

veza_incus_dns_suffix: lxd

haproxy_topology: blue-green

veza_public_host: veza.fr
veza_public_url: "https://veza.fr"
veza_cors_allowed_origins:
  - "https://veza.fr"
  - "https://app.veza.fr"

# Prod is INFO so 99th-percentile log volume stays manageable. Bump to
# DEBUG for a window via `ansible-playbook -e veza_log_level=DEBUG` if
# triaging an incident.
veza_log_level: INFO
veza_otel_sample_rate: "0.05"

veza_feature_flags:
  HYPERSWITCH_ENABLED: "true"
  STRIPE_CONNECT_ENABLED: "true"
  WEBAUTHN_ENABLED: "true"

# Larger retention than staging — prod rollback may need to reach a
# release from up to a month ago when the cause was latent.
veza_release_retention: 60

postgres_password: "{{ vault_postgres_password }}"
redis_password: "{{ vault_redis_password }}"
rabbitmq_password: "{{ vault_rabbitmq_password }}"

infra/ansible/group_vars/staging.yml (new file, 67 lines)
@@ -0,0 +1,67 @@
# Staging-specific overrides. Targets the local R720 Incus daemon (the
# same host the Forgejo runner lives on). Containers prefixed `veza-staging-*`
# share the `veza-staging-net` Incus bridge (10.0.21.0/24).
#
# Phase-1 simplification: staging and prod coexist on the same R720 but
# on separate Incus networks (veza-staging-net 10.0.21.0/24 vs
# veza-prod-net 10.0.20.0/24) and separate container name prefixes
# (veza-staging-* vs veza-prod-*). When prod migrates off-box (Hetzner
# or similar), this file's `veza_incus_host` flips to that target.
---
veza_env: staging
veza_release_channel: staging

# Where the Incus daemon lives. Used by the deploy workflow to decide
# which inventory host's `community.general.incus` connection plugin
# to drive containers from.
veza_incus_host: veza-staging
veza_incus_network: veza-staging-net
veza_incus_subnet: 10.0.21.0/24

# Container name prefix — every app/data container ends up named
# `<veza_container_prefix><component>[-<color>]`. e.g.
# veza-staging-backend-blue, veza-staging-postgres.
veza_container_prefix: "veza-staging-"

# DNS suffix Incus assigns to managed containers. The HAProxy template
# resolves backends as `<container>.<suffix>`. Default `.lxd` works
# with the stock Incus DNS resolver; override if you've renamed the
# managed network's DNS zone.
veza_incus_dns_suffix: lxd

# HAProxy strategy for the staging stack: blue/green, two app
# containers per component (active + standby). Differs from the lab
# inventory which uses an active/active multi-instance pattern.
haproxy_topology: blue-green

# Public-facing URLs — used by backend for OAuth redirects, email
# links, CSP origins, and by HAProxy ACLs.
veza_public_host: staging.veza.fr
veza_public_url: "https://staging.veza.fr"
veza_cors_allowed_origins:
  - "https://staging.veza.fr"
  - "https://staging-app.veza.fr"

# Logging — staging keeps DEBUG to make incident triage easy. Prod
# drops to INFO. Tracing sample rate stays at 100% in staging
# (low traffic) and 5% in prod (cost).
veza_log_level: DEBUG
veza_otel_sample_rate: "1.0"

# Feature flags exposed to the backend at boot. Keep this list small —
# the backend's own .env.template is the canonical reference.
veza_feature_flags:
  HYPERSWITCH_ENABLED: "false"
  STRIPE_CONNECT_ENABLED: "false"
  WEBAUTHN_ENABLED: "true"

# How many recent release SHAs the rollback workflow can target. Older
# tarballs are pruned by the Forgejo registry retention policy (set
# externally). 30 deploys ≈ a working week given the staging cadence.
veza_release_retention: 30

# Postgres password the migrations job uses — references vault.yml so
# rotation is one ansible-vault edit + one redeploy.
postgres_password: "{{ vault_postgres_password }}"
redis_password: "{{ vault_redis_password }}"
rabbitmq_password: "{{ vault_rabbitmq_password }}"

infra/ansible/playbooks/cleanup_failed.yml (new file, 83 lines)
@@ -0,0 +1,83 @@
# cleanup_failed.yml — destroy the app containers of a specific color.
# Used when a deploy_app.yml run failed Phase D or Phase F and the
# operator has finished forensics on the kept-alive failed color.
#
# Required extra-vars:
#   veza_env       staging | prod
#   target_color   blue | green   (the color to tear down)
#
# Safety: refuses to destroy the CURRENTLY-ACTIVE color. Active color
# is read from the HAProxy container's /var/lib/veza/active-color.
#
# Caller (workflow_dispatch only):
#   ansible-playbook -i inventory/{{env}}.yml playbooks/cleanup_failed.yml \
#     -e veza_env={{env}} -e target_color={{color}}
---
- name: Validate inputs and refuse to nuke the active color
  hosts: incus_hosts
  become: true
  gather_facts: false
  tasks:
    - name: Assert required vars
      ansible.builtin.assert:
        that:
          - veza_env is defined
          - veza_env in ['staging', 'prod']
          - target_color is defined
          - target_color in ['blue', 'green']
        fail_msg: cleanup_failed.yml requires veza_env + target_color.
        quiet: true

    - name: Read active color from HAProxy container
      ansible.builtin.shell: |
        incus exec "{{ veza_container_prefix }}haproxy" -- \
          cat /var/lib/veza/active-color 2>/dev/null | tr -d '[:space:]'
      args:
        executable: /bin/bash
      register: active_color_raw
      changed_when: false
      failed_when: false

    - name: Resolve current_active_color
      ansible.builtin.set_fact:
        current_active_color: "{{ active_color_raw.stdout if active_color_raw.stdout else 'blue' }}"

    - name: Refuse if target_color matches the active color
      ansible.builtin.fail:
        msg: >-
          target_color={{ target_color }} matches the currently-active
          color in HAProxy. Refusing to destroy live containers.
          Switch HAProxy first via rollback.yml or a re-deploy.
      when: target_color == current_active_color

- name: Destroy the inactive-color app containers
  hosts: incus_hosts
  become: true
  gather_facts: false
  tasks:
    - name: Force-delete each component container
      ansible.builtin.shell: |
        set -e
        CT="{{ veza_container_prefix }}{{ item }}-{{ target_color }}"
        if incus info "$CT" >/dev/null 2>&1; then
          incus delete --force "$CT"
          echo "Destroyed $CT"
        else
          echo "$CT does not exist, skip"
        fi
      args:
        executable: /bin/bash
      loop:
        - backend
        - stream
        - web
      register: cleanup_result
      changed_when: "'Destroyed' in (cleanup_result.stdout | default(''))"
      tags: [cleanup]

    - name: Report what was destroyed
      ansible.builtin.debug:
        msg: |
          Cleanup of color {{ target_color }} in env {{ veza_env }} complete.
          Active color unchanged: {{ current_active_color }}.
          Next deploy will recreate {{ target_color }} containers from scratch.

infra/ansible/playbooks/deploy_app.yml (new file, 355 lines)
@@ -0,0 +1,355 @@
# deploy_app.yml — second-half of every deploy. Runs AFTER
|
||||
# deploy_data.yml has snapshot + ensured data services up.
|
||||
#
|
||||
# Phases (mirror docs/RUNBOOK_ROLLBACK.md):
|
||||
# A — Run migrations in an ephemeral tools container.
|
||||
# B — Read /var/lib/veza/active-color in the HAProxy container,
|
||||
# compute inactive_color (the color we are deploying TO).
|
||||
# C — Destroy + relaunch the three app containers in inactive_color.
|
||||
# Apply roles/veza_app per component (artefact install + health
|
||||
# probe).
|
||||
# D — Implicit in C: veza_app role's probe.yml runs. If any color's
|
||||
# probe fails, the playbook errors and Phase E is skipped (HAProxy
|
||||
# still pointing at the prior active color).
|
||||
# E — Switch HAProxy via roles/veza_haproxy_switch (block/rescue
|
||||
# guards prior cfg).
|
||||
# F — External verification : curl through HAProxy, fail the playbook
|
||||
# (and reverse-switch) if the public health endpoint is < 200.
|
||||
#
|
||||
# Required extra-vars:
|
||||
# env staging | prod
|
||||
# release_sha 40-char git SHA
|
||||
---
|
||||
# =====================================================================
|
||||
# Phase A — Migrations
|
||||
# =====================================================================
|
||||
- name: Phase A — apply database migrations
|
||||
hosts: incus_hosts
|
||||
become: true
|
||||
gather_facts: true
|
||||
tasks:
|
||||
- name: Validate inputs
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- veza_env in ['staging', 'prod']
|
||||
- veza_release_sha | length == 40
|
||||
fail_msg: deploy_app.yml requires veza_env + veza_release_sha extra-vars.
|
||||
quiet: true
|
||||
|
||||
- name: Ensure ephemeral tools container exists
|
||||
ansible.builtin.shell: |
|
||||
set -e
|
||||
TOOLS="{{ veza_container_prefix }}backend-tools"
|
||||
if ! incus info "$TOOLS" >/dev/null 2>&1; then
|
||||
incus launch {{ veza_app_base_image }} "$TOOLS" \
|
||||
--profile veza-app --profile veza-net \
|
||||
--network "{{ veza_incus_network }}"
|
||||
for i in $(seq 1 30); do
|
||||
incus exec "$TOOLS" -- /bin/true 2>/dev/null && exit 0
|
||||
sleep 1
|
||||
done
|
||||
echo "tools container did not become ready"
|
||||
exit 1
|
||||
fi
|
||||
args:
|
||||
executable: /bin/bash
|
||||
register: tools_provision
|
||||
changed_when: "'incus launch' in (tools_provision.stdout | default(''))"
|
||||
tags: [phaseA, migrations]
|
||||
|
||||
- name: Refresh inventory so the tools container becomes reachable
|
||||
ansible.builtin.meta: refresh_inventory
|
||||
tags: [phaseA]
|
||||
|
||||
- name: Phase A — install backend artifact + run migrate_tool inside tools
|
||||
hosts: "{{ veza_container_prefix + 'backend-tools' }}"
|
||||
become: true
|
||||
gather_facts: false
|
||||
vars:
|
||||
ansible_connection: community.general.incus
|
||||
ansible_python_interpreter: /usr/bin/python3
|
||||
veza_component: backend
|
||||
veza_target_color: tools # not blue/green — bypass color logic in name
|
||||
tasks:
|
||||
- name: Apt deps for tools container
|
||||
ansible.builtin.apt:
|
||||
name:
|
||||
- ca-certificates
|
||||
- curl
|
||||
- postgresql-client
|
||||
- libssl3
|
||||
- zstd
|
||||
state: present
|
||||
update_cache: true
|
||||
cache_valid_time: 3600
|
||||
|
||||
- name: Ensure migrate user
|
||||
ansible.builtin.user:
|
||||
name: veza-migrate
|
||||
system: true
|
||||
shell: /usr/sbin/nologin
|
||||
|
||||
- name: Ensure /opt/veza/migrate
|
||||
ansible.builtin.file:
|
||||
path: /opt/veza/migrate
|
||||
state: directory
|
||||
owner: veza-migrate
|
||||
mode: "0755"
|
||||
|
||||
- name: Fetch backend tarball
|
||||
ansible.builtin.get_url:
|
||||
url: "{{ veza_artifact_base_url }}/veza-backend/{{ veza_release_sha }}/veza-backend-{{ veza_release_sha }}.tar.zst"
|
||||
dest: "/tmp/veza-backend-{{ veza_release_sha }}.tar.zst"
|
||||
mode: "0600"
|
||||
headers:
|
||||
Authorization: "token {{ vault_forgejo_registry_token | default('') }}"
|
||||
force: false
|
||||
|
||||
- name: Extract tarball into /opt/veza/migrate
|
||||
ansible.builtin.unarchive:
|
||||
src: "/tmp/veza-backend-{{ veza_release_sha }}.tar.zst"
|
||||
dest: "/opt/veza/migrate"
|
||||
remote_src: true
|
||||
owner: veza-migrate
|
||||
creates: "/opt/veza/migrate/migrate_tool"
|
||||
|
||||
- name: Run migrate_tool
|
||||
ansible.builtin.command: /opt/veza/migrate/migrate_tool --up
|
||||
environment:
|
||||
DATABASE_URL: "postgres://veza:{{ vault_postgres_password }}@{{ veza_container_prefix }}postgres.{{ veza_incus_dns_suffix }}:5432/veza?sslmode=disable"
|
||||
register: migrate_result
|
||||
changed_when: "'no changes' not in (migrate_result.stdout | default('').lower())"
|
||||
no_log: true # DATABASE_URL contains the password
|
||||
tags: [phaseA, migrations]


# =====================================================================
# Phase B — Determine inactive color
# =====================================================================
- name: Phase B — read active color, compute inactive_color
  hosts: "{{ veza_container_prefix + 'haproxy' }}"
  become: true
  gather_facts: false
  vars:
    ansible_connection: community.general.incus
    ansible_python_interpreter: /usr/bin/python3
  tasks:
    - name: Read currently-active color
      ansible.builtin.slurp:
        src: /var/lib/veza/active-color
      register: prior_color_raw
      failed_when: false

    - name: Resolve prior_active_color (default blue if no history)
      ansible.builtin.set_fact:
        prior_active_color: >-
          {{ (prior_color_raw.content | b64decode | trim) if prior_color_raw.content is defined
             else 'blue' }}
        cacheable: true

    - name: Compute inactive_color (the one we deploy TO)
      ansible.builtin.set_fact:
        inactive_color: "{{ 'green' if prior_active_color == 'blue' else 'blue' }}"
        cacheable: true

    - name: Show what we are switching to
      ansible.builtin.debug:
        msg: >-
          Deploying SHA {{ veza_release_sha[:12] }} to color
          {{ inactive_color }} (currently active: {{ prior_active_color }}).


# =====================================================================
# Phase C — destroy + relaunch the three app containers in inactive_color
# =====================================================================
- name: Phase C — recreate inactive-color app containers (host-side)
  hosts: incus_hosts
  become: true
  gather_facts: false
  vars:
    inactive_color: "{{ hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
  tasks:
    - name: Destroy + launch each component container
      ansible.builtin.shell: |
        set -e
        CT="{{ veza_container_prefix }}{{ item }}-{{ inactive_color }}"
        # Force-delete is fine — these are stateless app containers; the
        # active color is untouched.
        incus delete --force "$CT" 2>/dev/null || true
        incus launch {{ veza_app_base_image }} "$CT" \
          --profile veza-app \
          --profile veza-net \
          --network "{{ veza_incus_network }}"
        for i in $(seq 1 {{ veza_app_container_ready_timeout | default(30) }}); do
          if incus exec "$CT" -- /bin/true 2>/dev/null; then
            exit 0
          fi
          sleep 1
        done
        echo "Container $CT did not become ready"
        exit 1
      args:
        executable: /bin/bash
      loop:
        - backend
        - stream
        - web
      changed_when: true
      tags: [phaseC]

    - name: Refresh inventory so freshly-launched containers become reachable
      ansible.builtin.meta: refresh_inventory
      tags: [phaseC]


- name: Phase C — provision backend (inactive color) via veza_app role
  hosts: "{{ veza_container_prefix + 'backend-' + hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
  become: true
  gather_facts: false
  vars:
    ansible_connection: community.general.incus
    ansible_python_interpreter: /usr/bin/python3
    veza_component: backend
    veza_target_color: "{{ hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
  roles:
    - veza_app
  tags: [phaseC, backend]


- name: Phase C — provision stream (inactive color)
  hosts: "{{ veza_container_prefix + 'stream-' + hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
  become: true
  gather_facts: false
  vars:
    ansible_connection: community.general.incus
    ansible_python_interpreter: /usr/bin/python3
    veza_component: stream
    veza_target_color: "{{ hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
  roles:
    - veza_app
  tags: [phaseC, stream]


- name: Phase C — provision web (inactive color)
  hosts: "{{ veza_container_prefix + 'web-' + hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
  become: true
  gather_facts: false
  vars:
    ansible_connection: community.general.incus
    ansible_python_interpreter: /usr/bin/python3
    veza_component: web
    veza_target_color: "{{ hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
  roles:
    - veza_app
  tags: [phaseC, web]


# =====================================================================
# Phase D — cross-container probes (in addition to in-container probes
# that veza_app already ran). This catches the case where the service
# is up locally but unreachable via Incus DNS.
# =====================================================================
- name: Phase D — probe each component via Incus DNS (cross-container)
  hosts: "{{ veza_container_prefix + 'haproxy' }}"
  become: true
  gather_facts: false
  vars:
    ansible_connection: community.general.incus
    ansible_python_interpreter: /usr/bin/python3
  tasks:
    - name: Curl each component's health endpoint
      ansible.builtin.uri:
        url: "http://{{ veza_container_prefix }}{{ item.component }}-{{ inactive_color }}.{{ veza_incus_dns_suffix }}:{{ item.port }}{{ item.path }}"
        method: GET
        status_code: [200]
        timeout: 5
      register: cross_probe
      retries: "{{ veza_healthcheck_retries }}"
      delay: "{{ veza_healthcheck_delay_seconds }}"
      until: cross_probe.status == 200
      changed_when: false
      loop:
        - { component: backend, port: "{{ veza_backend_port }}", path: "{{ veza_healthcheck_paths.backend }}" }
        - { component: stream, port: "{{ veza_stream_port }}", path: "{{ veza_healthcheck_paths.stream }}" }
        - { component: web, port: "{{ veza_web_port }}", path: "{{ veza_healthcheck_paths.web }}" }
  tags: [phaseD, probe]
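
# When Phase D fails, the same probe can be reproduced by hand (illustrative;
# the prefix, DNS suffix and port below are assumptions — take the real values
# from group_vars/<env>.yml):
#   incus exec veza-haproxy -- curl -fsS \
#     http://veza-backend-green.incus:8080/api/v1/health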


# =====================================================================
# Phase E — switch HAProxy. roles/veza_haproxy_switch wraps render +
# validate + atomic-swap + HUP in a block/rescue that restores prior
# cfg on failure.
# =====================================================================
- name: Phase E — switch HAProxy to the new color
  hosts: "{{ veza_container_prefix + 'haproxy' }}"
  become: true
  gather_facts: true   # roles/veza_haproxy_switch wants ansible_date_time
  vars:
    ansible_connection: community.general.incus
    ansible_python_interpreter: /usr/bin/python3
    veza_active_color: "{{ inactive_color }}"   # the color we ARE switching TO
  roles:
    - veza_haproxy_switch
  tags: [phaseE, switch]


# =====================================================================
# Phase F — Post-deploy verification (external curl through HAProxy).
# If this fails, we revert HAProxy to the prior color via a second run
# of veza_haproxy_switch and fail the playbook.
# =====================================================================
- name: Phase F — verify externally + record deploy state
  hosts: incus_hosts
  become: true
  gather_facts: true
  vars:
    inactive_color: "{{ hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
    prior_active_color: "{{ hostvars[veza_container_prefix + 'haproxy']['prior_active_color'] }}"
  tasks:
    # The verify + record steps run inside a block so the rescue below can
    # revert HAProxy and fail the play when the public probe never goes green.
    - name: Verify externally + record deploy state
      block:
        - name: Curl public health endpoint via HAProxy
          ansible.builtin.uri:
            url: "{{ veza_public_url }}/api/v1/health"
            method: GET
            status_code: [200]
            timeout: 10
            validate_certs: "{{ veza_public_url.startswith('https://') }}"
          register: public_health
          retries: 10
          delay: 3
          until: public_health.status == 200
          tags: [phaseF, verify]

        - name: Write deploy-state metrics (consumed by the node-exporter textfile collector)
          ansible.builtin.copy:
            dest: /var/lib/node_exporter/textfile_collector/veza_deploy.prom
            content: |
              # HELP veza_deploy_active_color 0=blue, 1=green.
              # TYPE veza_deploy_active_color gauge
              veza_deploy_active_color{env="{{ veza_env }}"} {{ 0 if inactive_color == 'blue' else 1 }}
              # HELP veza_deploy_release_sha info metric, label=sha.
              # TYPE veza_deploy_release_sha gauge
              veza_deploy_release_sha{env="{{ veza_env }}",sha="{{ veza_release_sha }}",color="{{ inactive_color }}"} 1
              # HELP veza_deploy_last_success_timestamp unix epoch of last successful deploy.
              # TYPE veza_deploy_last_success_timestamp gauge
              veza_deploy_last_success_timestamp{env="{{ veza_env }}"} {{ ansible_date_time.epoch }}
            mode: "0644"
          tags: [phaseF, metrics]
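
        # The last gauge supports a simple deploy-freshness check. Hedged
        # example of querying it (the Prometheus URL and the 7-day threshold
        # are assumptions, not something this repo ships):
        #   curl -G http://prometheus:9090/api/v1/query --data-urlencode \
        #     'query=time() - veza_deploy_last_success_timestamp{env="prod"} > 7 * 86400'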

      rescue:
        - name: Public health failed — record the failure timestamp
          ansible.builtin.copy:
            dest: /var/lib/node_exporter/textfile_collector/veza_deploy.prom
            content: |
              # HELP veza_deploy_last_failure_timestamp unix epoch of last failed deploy.
              # TYPE veza_deploy_last_failure_timestamp gauge
              veza_deploy_last_failure_timestamp{env="{{ veza_env }}",sha="{{ veza_release_sha }}",color="{{ inactive_color }}"} {{ ansible_date_time.epoch }}
            mode: "0644"
          failed_when: false

        - name: Re-switch HAProxy back to the prior color
          ansible.builtin.import_role:
            name: veza_haproxy_switch
          vars:
            veza_active_color: "{{ prior_active_color }}"
          delegate_to: "{{ veza_container_prefix + 'haproxy' }}"

        - name: Fail the playbook
          ansible.builtin.fail:
            msg: >-
              Public health probe via HAProxy failed after deploy of SHA
              {{ veza_release_sha[:12] }} to color {{ inactive_color }}.
              HAProxy reverted to the prior color ({{ prior_active_color }}).
              The freshly-deployed {{ inactive_color }} containers are kept
              alive for forensics — inspect with:
              incus exec {{ veza_container_prefix }}backend-{{ inactive_color }} -- journalctl -u veza-backend -n 200
411
infra/ansible/playbooks/deploy_data.yml
Normal file

@ -0,0 +1,411 @@
# deploy_data.yml — idempotent data-tier provisioning. Runs FIRST in
# every deploy. Three principles:
#
#   1. ZFS-snapshot every data container's dataset before doing
#      anything else. The snapshot is the safety net for any later
#      mistake in the same run.
#   2. Containers are created if absent, never destroyed. Volumes
#      survive every deploy.
#   3. Service config drift is reconciled, but state-bearing things
#      (data dirs, schema, MinIO buckets) are reload-not-restart
#      where the daemon supports it.
#
# Required extra-vars:
#   env          one of staging|prod (selects inventory group_vars)
#   release_sha  git SHA of the release (snapshot label)
#
# Caller pattern in .forgejo/workflows/deploy.yml:
#   ansible-playbook -i inventory/{{env}}.yml playbooks/deploy_data.yml \
#     -e env={{env}} -e release_sha={{sha}}
---
- name: Pre-flight — validate inputs and resolve runtime context
  hosts: incus_hosts
  become: true
  gather_facts: true
  tasks:
    - name: Assert required vars are set
      ansible.builtin.assert:
        that:
          - veza_env is defined
          - veza_env in ['staging', 'prod']
          - veza_release_sha is defined
          - veza_release_sha | length == 40
        fail_msg: >-
          deploy_data.yml requires veza_env (staging|prod) +
          veza_release_sha (40-char SHA). Pass via -e on the
          command line or via inventory group_vars.

    - name: Compute the list of data containers we manage
      ansible.builtin.set_fact:
        veza_data_containers:
          - name: "{{ veza_container_prefix }}postgres"
            kind: postgres
          - name: "{{ veza_container_prefix }}redis"
            kind: redis
          - name: "{{ veza_container_prefix }}rabbitmq"
            kind: rabbitmq
          - name: "{{ veza_container_prefix }}minio"
            kind: minio


# -----------------------------------------------------------------------
# ZFS snapshot before mutation. A failed prune is logged but not fatal —
# safer to lose disk to retained snapshots than to skip the snapshot.
# -----------------------------------------------------------------------
- name: ZFS-snapshot every data container's dataset
  hosts: incus_hosts
  become: true
  gather_facts: false
  tasks:
    - name: Snapshot per-container dataset
      ansible.builtin.shell: |
        set -e
        # Best-effort dataset path resolution from `incus storage volume show`.
        # If the container doesn't exist yet (first-ever deploy), skip — there's
        # nothing to snapshot.
        if ! incus info "{{ item.name }}" >/dev/null 2>&1; then
          echo "Container {{ item.name }} does not yet exist, skip snapshot"
          exit 0
        fi
        DATASET=$(zfs list -H -o name | grep -E "containers/{{ item.name }}$" | head -1 || true)
        if [ -z "$DATASET" ]; then
          echo "No ZFS dataset for {{ item.name }} — likely non-ZFS storage, skip"
          exit 0
        fi
        SNAP_NAME="${DATASET}@pre-deploy-{{ veza_release_sha }}"
        if zfs list -H -t snapshot "$SNAP_NAME" >/dev/null 2>&1; then
          echo "Snapshot $SNAP_NAME already exists (idempotent rerun)"
          exit 0
        fi
        zfs snapshot "$SNAP_NAME"
        echo "Created $SNAP_NAME"
      args:
        executable: /bin/bash
      loop: "{{ veza_data_containers }}"
      register: snap_result
      changed_when: "'Created' in (snap_result.stdout | default(''))"
      tags: [data, zfs, snapshot]

    - name: Prune ZFS snapshots beyond retention window
      ansible.builtin.shell: |
        set -e
        # Keep the {{ veza_release_retention | default(30) }} most-recent
        # pre-deploy snapshots per dataset; delete the rest.
        for dataset in $(zfs list -H -o name | grep -E "containers/{{ veza_container_prefix }}(postgres|redis|rabbitmq|minio)$"); do
          zfs list -H -t snapshot -o name -s creation "$dataset" \
            | grep "@pre-deploy-" \
            | head -n -{{ veza_release_retention | default(30) }} \
            | xargs -r -n1 zfs destroy -r || true
        done
      args:
        executable: /bin/bash
      changed_when: false
      failed_when: false
      tags: [data, zfs, prune]
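
    # Rolling a data container back to its pre-deploy snapshot is a manual,
    # operator-driven step (illustrative; the pool/dataset path and SHA are
    # assumptions — take the exact names from `zfs list -t snapshot`):
    #   incus stop veza-postgres
    #   zfs rollback -r tank/incus/containers/veza-postgres@pre-deploy-<sha>
    #   incus start veza-postgres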


# -----------------------------------------------------------------------
# Provision (create-if-absent) each data container. We don't recreate
# existing ones — they own state.
# -----------------------------------------------------------------------
- name: Ensure data containers exist
  hosts: incus_hosts
  become: true
  gather_facts: false
  tasks:
    - name: Launch container if absent
      ansible.builtin.shell: |
        set -e
        if incus info "{{ item.name }}" >/dev/null 2>&1; then
          echo "{{ item.name }} already exists"
          exit 0
        fi
        incus launch {{ veza_app_base_image }} "{{ item.name }}" \
          --profile veza-data \
          --profile veza-net \
          --network "{{ veza_incus_network }}"
        # Wait for the container's API to respond before any subsequent task
        # (apt, systemd) hits a half-up container.
        for i in $(seq 1 {{ veza_app_container_ready_timeout | default(30) }}); do
          if incus exec "{{ item.name }}" -- /bin/true 2>/dev/null; then
            echo "Container {{ item.name }} ready"
            exit 0
          fi
          sleep 1
        done
        echo "Container {{ item.name }} did not become ready within timeout"
        exit 1
      args:
        executable: /bin/bash
      loop: "{{ veza_data_containers }}"
      register: launch_result
      changed_when: "'Container' in (launch_result.stdout | default('')) and 'ready' in (launch_result.stdout | default(''))"
      tags: [data, provision]

    - name: Refresh inventory so the new containers become reachable
      ansible.builtin.meta: refresh_inventory
      tags: [data, provision]


# -----------------------------------------------------------------------
# Per-kind service config. Implemented inline rather than via roles so
# this playbook stays readable. When a kind grows, lift it into its own
# tasks/<kind>.yml or role.
# -----------------------------------------------------------------------
- name: Configure postgres
  hosts: "{{ veza_container_prefix + 'postgres' }}"
  become: true
  gather_facts: false
  vars:
    ansible_connection: community.general.incus
    ansible_python_interpreter: /usr/bin/python3
  tasks:
    - name: Install postgresql-16
      ansible.builtin.apt:
        name:
          - postgresql-16
          - python3-psycopg2   # required by Ansible's postgresql_user/db modules
        state: present
        update_cache: true
        cache_valid_time: 3600

    - name: Ensure postgres is enabled + started
      ansible.builtin.systemd:
        name: postgresql
        state: started
        enabled: true

    - name: Wait for postgres ready
      ansible.builtin.wait_for:
        port: 5432
        host: 127.0.0.1
        timeout: 30

    - name: Ensure veza role exists with the vault-stored password
      community.postgresql.postgresql_user:
        name: veza
        password: "{{ vault_postgres_password }}"
        role_attr_flags: LOGIN
      become_user: postgres
      no_log: true

    - name: Ensure veza database exists owned by veza role
      community.postgresql.postgresql_db:
        name: veza
        owner: veza
        encoding: UTF8
        lc_collate: C
        lc_ctype: C
        template: template0
      become_user: postgres
  tags: [data, postgres]


- name: Configure redis
  hosts: "{{ veza_container_prefix + 'redis' }}"
  become: true
  gather_facts: false
  vars:
    ansible_connection: community.general.incus
    ansible_python_interpreter: /usr/bin/python3
  tasks:
    - name: Install redis-server
      ansible.builtin.apt:
        name: redis-server
        state: present
        update_cache: true
        cache_valid_time: 3600

    - name: Render redis.conf with password
      ansible.builtin.copy:
        content: |
          bind 0.0.0.0
          protected-mode yes
          port 6379
          requirepass {{ vault_redis_password }}
          maxmemory 256mb
          maxmemory-policy allkeys-lru
          appendonly yes
          appendfsync everysec
          dir /var/lib/redis
        dest: /etc/redis/redis.conf
        owner: redis
        group: redis
        mode: "0640"
      no_log: true
      notify: Restart redis

    - name: Ensure redis is enabled + started
      ansible.builtin.systemd:
        name: redis-server
        state: started
        enabled: true

    - name: Wait for redis ready
      ansible.builtin.wait_for:
        port: 6379
        host: 127.0.0.1
        timeout: 30
  handlers:
    - name: Restart redis
      ansible.builtin.systemd:
        name: redis-server
        state: restarted
  tags: [data, redis]


- name: Configure rabbitmq
  hosts: "{{ veza_container_prefix + 'rabbitmq' }}"
  become: true
  gather_facts: false
  vars:
    ansible_connection: community.general.incus
    ansible_python_interpreter: /usr/bin/python3
  tasks:
    - name: Install rabbitmq-server
      ansible.builtin.apt:
        name: rabbitmq-server
        state: present
        update_cache: true
        cache_valid_time: 3600

    - name: Ensure rabbitmq is enabled + started
      ansible.builtin.systemd:
        name: rabbitmq-server
        state: started
        enabled: true

    - name: Wait for rabbitmq ready
      ansible.builtin.wait_for:
        port: 5672
        host: 127.0.0.1
        timeout: 60

    - name: Ensure /veza vhost exists
      community.rabbitmq.rabbitmq_vhost:
        name: /veza
        state: present

    - name: Ensure veza user exists with vault password
      community.rabbitmq.rabbitmq_user:
        user: veza
        password: "{{ vault_rabbitmq_password }}"
        vhost: /veza
        configure_priv: ".*"
        read_priv: ".*"
        write_priv: ".*"
        state: present
        update_password: always
      no_log: true
  tags: [data, rabbitmq]
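
# Quick manual verification of the vhost + permissions (illustrative; the
# container name assumes the default "veza-" prefix):
#   incus exec veza-rabbitmq -- rabbitmqctl list_vhosts
#   incus exec veza-rabbitmq -- rabbitmqctl list_permissions -p /veza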


- name: Configure minio
  hosts: "{{ veza_container_prefix + 'minio' }}"
  become: true
  gather_facts: false
  vars:
    ansible_connection: community.general.incus
    ansible_python_interpreter: /usr/bin/python3
  tasks:
    - name: Install MinIO server + mc client (direct download if absent)
      ansible.builtin.shell: |
        set -e
        if ! command -v minio >/dev/null 2>&1; then
          curl -fsSL https://dl.min.io/server/minio/release/linux-amd64/minio -o /usr/local/bin/minio
          chmod 0755 /usr/local/bin/minio
        fi
        if ! command -v mc >/dev/null 2>&1; then
          curl -fsSL https://dl.min.io/client/mc/release/linux-amd64/mc -o /usr/local/bin/mc
          chmod 0755 /usr/local/bin/mc
        fi
      args:
        executable: /bin/bash
      changed_when: false

    - name: Ensure minio system user
      ansible.builtin.user:
        name: minio
        system: true
        shell: /usr/sbin/nologin
        home: /var/lib/minio

    - name: Ensure minio data dir
      ansible.builtin.file:
        path: /var/lib/minio
        state: directory
        owner: minio
        group: minio
        mode: "0750"

    - name: Render minio EnvironmentFile
      ansible.builtin.copy:
        content: |
          MINIO_ROOT_USER={{ vault_minio_root_user }}
          MINIO_ROOT_PASSWORD={{ vault_minio_root_password }}
          MINIO_VOLUMES=/var/lib/minio
          MINIO_OPTS="--address :9000 --console-address :9001"
        dest: /etc/default/minio
        owner: root
        group: root
        mode: "0640"
      no_log: true
      notify: Restart minio

    - name: Render minio systemd unit
      ansible.builtin.copy:
        content: |
          [Unit]
          Description=MinIO
          After=network-online.target
          Wants=network-online.target

          [Service]
          Type=simple
          User=minio
          Group=minio
          EnvironmentFile=/etc/default/minio
          ExecStart=/usr/local/bin/minio server $MINIO_OPTS $MINIO_VOLUMES
          Restart=on-failure
          LimitNOFILE=65535

          [Install]
          WantedBy=multi-user.target
        dest: /etc/systemd/system/minio.service
        mode: "0644"
      notify:
        - Reload systemd
        - Restart minio

    - name: Enable + start minio
      ansible.builtin.systemd:
        name: minio
        state: started
        enabled: true
        daemon_reload: true

    - name: Wait for minio ready
      ansible.builtin.wait_for:
        port: 9000
        host: 127.0.0.1
        timeout: 60

    - name: Configure mc client alias
      ansible.builtin.shell: |
        set -e
        mc alias set veza-local http://127.0.0.1:9000 \
          "{{ vault_minio_root_user }}" "{{ vault_minio_root_password }}" >/dev/null
      args:
        executable: /bin/bash
      changed_when: false
      no_log: true

    - name: Ensure veza-{{ veza_env }} bucket exists
      ansible.builtin.shell: |
        mc mb --ignore-existing veza-local/veza-{{ veza_env }}
      args:
        executable: /bin/bash
      changed_when: false
  handlers:
    - name: Reload systemd
      ansible.builtin.systemd:
        daemon_reload: true
    - name: Restart minio
      ansible.builtin.systemd:
        name: minio
        state: restarted
  tags: [data, minio]
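
# The bucket can be checked by hand from inside the container (illustrative;
# bucket name shown for the staging env):
#   incus exec veza-minio -- mc ls veza-local/veza-staging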

113
infra/ansible/playbooks/rollback.yml
Normal file

@ -0,0 +1,113 @@
# rollback.yml — two modes:
#
#   1. fast : flip HAProxy back to the previous active color.
#             Works only if those containers are still alive
#             (i.e., the next deploy has NOT yet recycled them).
#             Effect time: ~5 seconds.
#
#   2. full : redeploy a specific release_sha by re-running
#             deploy_app.yml with that SHA. Works whenever the
#             tarball is still in the Forgejo Registry. Effect
#             time: ~5-10 minutes.
#
# Required extra-vars:
#   env           staging | prod
#   mode          fast | full
#   target_color  (mode=fast only) the color to flip TO
#   release_sha   (mode=full only) the SHA to redeploy
#
# Caller (workflow_dispatch only — see .forgejo/workflows/rollback.yml):
#   ansible-playbook -i inventory/{{env}}.yml playbooks/rollback.yml \
#     -e env={{env}} -e mode=fast -e target_color=blue
#   ansible-playbook -i inventory/{{env}}.yml playbooks/rollback.yml \
#     -e env={{env}} -e mode=full -e release_sha=<previous_sha>
---
- name: Validate inputs
  hosts: incus_hosts
  become: true
  gather_facts: false
  tasks:
    - name: Assert env + mode
      ansible.builtin.assert:
        that:
          - veza_env is defined
          - veza_env in ['staging', 'prod']
          - mode is defined
          - mode in ['fast', 'full']
        fail_msg: rollback.yml requires veza_env + mode (fast|full).
        quiet: true

    - name: Assert target_color when mode=fast
      ansible.builtin.assert:
        that:
          - target_color is defined
          - target_color in ['blue', 'green']
        fail_msg: rollback.yml mode=fast requires target_color (blue|green).
        quiet: true
      when: mode == 'fast'

    - name: Assert release_sha when mode=full
      ansible.builtin.assert:
        that:
          - veza_release_sha is defined
          - veza_release_sha | length == 40
        fail_msg: rollback.yml mode=full requires release_sha (40-char SHA).
        quiet: true
      when: mode == 'full'


# ---------------------------------------------------------------------
# mode=fast → HAProxy flip only.
# ---------------------------------------------------------------------
- name: Fast rollback — verify target_color containers are alive
  hosts: incus_hosts
  become: true
  gather_facts: false
  tasks:
    - name: Check each target-color container exists
      ansible.builtin.shell: |
        set -e
        CT="{{ veza_container_prefix }}{{ item }}-{{ target_color }}"
        if ! incus info "$CT" >/dev/null 2>&1; then
          echo "MISSING $CT"
          exit 1
        fi
        STATE=$(incus list "$CT" -c s --format csv)
        if [ "$STATE" != "RUNNING" ]; then
          echo "$CT is $STATE (not RUNNING)"
          exit 1
        fi
        echo "OK $CT"
      args:
        executable: /bin/bash
      loop:
        - backend
        - stream
        - web
      changed_when: false
      register: alive_check
      when: mode == 'fast'
  tags: [rollback, fast]


- name: Fast rollback — flip HAProxy
  hosts: "{{ veza_container_prefix + 'haproxy' }}"
  become: true
  gather_facts: true
  vars:
    ansible_connection: community.general.incus
    ansible_python_interpreter: /usr/bin/python3
    veza_active_color: "{{ target_color }}"
    # Fast rollback re-uses the previous SHA from the history file.
    veza_release_sha: "{{ lookup('ansible.builtin.file', '/var/lib/veza/active-color.history', errors='ignore') | regex_search('sha=([0-9a-f]+)', '\\1') | default(['rollback'], true) | first }}"
  roles:
    # The mode condition sits on the role entry (plays themselves don't
    # accept `when`).
    - role: veza_haproxy_switch
      when: mode == 'fast'
  tags: [rollback, fast]


# ---------------------------------------------------------------------
# mode=full → re-import deploy_app.yml with the rollback SHA.
# Functionally identical to a fresh deploy of an older release.
# ---------------------------------------------------------------------
- name: Full rollback — delegate to deploy_app.yml with release_sha={{ veza_release_sha | default('') }}
  ansible.builtin.import_playbook: deploy_app.yml
  when: mode == 'full'
  tags: [rollback, full]
@ -1,5 +1,16 @@
# Managed by Ansible — do not edit by hand.
# v1.0.9 W4 Day 19.
# v1.0.9 W4 Day 19 (multi-instance) → W5+ extended to blue/green.
# `haproxy_topology` (set in group_vars/<env>.yml) selects between:
#
#   multi-instance (default, lab) — server list comes from inventory
#     groups backend_api_instances, stream_server_instances; sticky
#     cookie load-balances across N peers.
#   blue-green (staging, prod)    — server list is exactly two:
#     <prefix>backend-blue + <prefix>backend-green. veza_active_color
#     picks which one is primary; the other is `backup` (HAProxy
#     routes to a backup server only when ALL primaries are down).
#     The veza_haproxy_switch role re-renders this template with a
#     new active_color, validates, atomic-swaps, and HUPs.

global
    log /dev/log local0

@ -10,11 +21,7 @@ global
    user haproxy
    group haproxy
    daemon
    # Avoid leaking the version banner in error pages.
    server-state-file /var/lib/haproxy/server-state
    # ssl-default-bind-* tightens TLS to modern ciphers; lifted directly
    # from the Mozilla Intermediate profile. Only effective when a TLS
    # cert is mounted (see haproxy_tls_cert_path).
    ssl-default-bind-options no-sslv3 no-tlsv10 no-tlsv11
    ssl-default-bind-ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305


@ -23,22 +30,20 @@ defaults
    mode http
    option httplog
    option dontlognull
    option forwardfor # adds X-Forwarded-For so backend logs see the real IP
    option forwardfor
    option http-server-close
    timeout connect 5s
    timeout client 60s
    timeout server 60s
    timeout tunnel 1h # WS connections are long-lived; bumped from default 1m
    timeout tunnel 1h
    timeout client-fin 5s
    timeout http-keep-alive 15s
    timeout http-request 10s
    # Restore previous server state on reload so health checks don't
    # restart from scratch + the drain timer survives.
    load-server-state-from-file global

# -----------------------------------------------------------------------
# Stats endpoint — bound to loopback only so the prometheus haproxy
# exporter (sidecar) can scrape it. Auth lives at the bridge layer.
# Stats endpoint — bound to loopback only; the Prometheus haproxy
# exporter sidecar scrapes it.
# -----------------------------------------------------------------------
frontend stats
    bind 127.0.0.1:{{ haproxy_listen_stats }}

@ -50,8 +55,7 @@ frontend stats
    no log

# -----------------------------------------------------------------------
# Frontend HTTP. v1.0 lab uses HTTP only; uncomment the HTTPS bind
# when haproxy_tls_cert_path is non-empty (Mozilla intermediate).
# Frontend — HTTP + (optionally) HTTPS. ACL-driven path routing.
# -----------------------------------------------------------------------
frontend veza_http_in
    bind *:{{ haproxy_listen_http }}

@ -61,23 +65,102 @@ frontend veza_http_in
    http-request redirect scheme https code 301 if !{ ssl_fc }
{% endif %}

    # Path-based routing:
    #   /api/v1/ws/*  → API backend (sticky cookie; carries chat WS)
    #   /api/v1/*     → API backend (also sticky so 401 → /me roundtrips work)
    #   /tracks/*/hls → stream backend (URI-hash for cache locality)
    #   else          → web_pool (blue-green) / api_pool (multi-instance)
    acl is_api path_beg /api/v1
{% if haproxy_topology | default('multi-instance') == 'blue-green' %}
    acl is_stream_seg path_beg /tracks/ path_end .m3u8
    acl is_stream_seg path_beg /tracks/ path_end .ts
    acl is_stream_seg path_beg /tracks/ path_end .m4s
    acl is_stream_path path_beg /stream
    acl is_stream_path path_beg /hls
    use_backend backend_api if is_api
    use_backend stream_pool if is_stream_seg
    use_backend stream_pool if is_stream_path
    default_backend web_pool
{% else %}
    acl is_stream path_beg /tracks/ path_end .m3u8
    acl is_stream path_beg /tracks/ path_end .ts
    acl is_stream path_beg /tracks/ path_end .m4s

    use_backend stream_pool if is_stream
    default_backend api_pool
{% endif %}

{% if haproxy_topology | default('multi-instance') == 'blue-green' %}
# =======================================================================
# BLUE / GREEN topology (staging, prod)
#
# active_color is the variable veza_haproxy_switch passes in. It selects
# which server gets `check` and which gets `check backup`. HAProxy only
# routes to a `backup` server when EVERY non-backup is marked down by
# its health check; together with health-check fall=3 this gives us
# instant rollback to the prior color if the new one starts failing
# health checks (without re-running Ansible).
#
# Active color:      {{ veza_active_color | default(haproxy_active_color | default('blue')) }}
# Container prefix:  {{ veza_container_prefix }}
# DNS suffix:        {{ veza_incus_dns_suffix }}
# =======================================================================
{% set _active = veza_active_color | default(haproxy_active_color | default('blue')) %}

# -----------------------------------------------------------------------
# Backend API pool — Go. Sticky cookie; backup color sits idle.
# -----------------------------------------------------------------------
backend backend_api
    balance roundrobin
    option httpchk GET {{ veza_healthcheck_paths.backend | default('/api/v1/health') }}
    http-check expect status 200
    cookie {{ haproxy_sticky_cookie_name }} insert indirect nocache httponly secure
    default-server check inter {{ haproxy_health_check_interval_ms }} fall {{ haproxy_health_check_fall }} rise {{ haproxy_health_check_rise }} on-marked-down shutdown-sessions slowstart {{ haproxy_graceful_drain_seconds }}s
    server backend_blue {{ veza_container_prefix }}backend-blue.{{ veza_incus_dns_suffix }}:{{ veza_backend_port }} cookie backend_blue {{ '' if _active == 'blue' else 'backup' }}
    server backend_green {{ veza_container_prefix }}backend-green.{{ veza_incus_dns_suffix }}:{{ veza_backend_port }} cookie backend_green {{ '' if _active == 'green' else 'backup' }}
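
    # To confirm which color HAProxy is actually routing to at runtime, query
    # the admin socket from inside the container (illustrative; the socket
    # path is the one scripts/deploy-canary.sh already assumes):
    #   echo "show servers state backend_api" | socat stdio /run/haproxy/admin.sock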

# -----------------------------------------------------------------------
# Stream pool — Rust Axum HLS. URI-hash for cache locality. Same
# blue/green pair, same backup-flag pattern.
# -----------------------------------------------------------------------
backend stream_pool
    balance uri whole
    hash-type consistent
    option httpchk GET {{ veza_healthcheck_paths.stream | default('/health') }}
    http-check expect status 200
    timeout tunnel 1h
    default-server check inter {{ haproxy_health_check_interval_ms }} fall {{ haproxy_health_check_fall }} rise {{ haproxy_health_check_rise }} on-marked-down shutdown-sessions slowstart {{ haproxy_graceful_drain_seconds }}s
    server stream_blue {{ veza_container_prefix }}stream-blue.{{ veza_incus_dns_suffix }}:{{ veza_stream_port }} {{ '' if _active == 'blue' else 'backup' }}
    server stream_green {{ veza_container_prefix }}stream-green.{{ veza_incus_dns_suffix }}:{{ veza_stream_port }} {{ '' if _active == 'green' else 'backup' }}

# -----------------------------------------------------------------------
# Web pool — React SPA served by nginx. Same pair, same pattern.
# -----------------------------------------------------------------------
backend web_pool
    balance roundrobin
    option httpchk GET {{ veza_healthcheck_paths.web | default('/') }}
    http-check expect status 200
    default-server check inter {{ haproxy_health_check_interval_ms }} fall {{ haproxy_health_check_fall }} rise {{ haproxy_health_check_rise }} on-marked-down shutdown-sessions slowstart {{ haproxy_graceful_drain_seconds }}s
    server web_blue {{ veza_container_prefix }}web-blue.{{ veza_incus_dns_suffix }}:{{ veza_web_port }} {{ '' if _active == 'blue' else 'backup' }}
    server web_green {{ veza_container_prefix }}web-green.{{ veza_incus_dns_suffix }}:{{ veza_web_port }} {{ '' if _active == 'green' else 'backup' }}

{% else %}
# =======================================================================
# MULTI-INSTANCE topology (lab, default)
# Server list comes from inventory groups; sticky cookie load-balances.
# =======================================================================

# -----------------------------------------------------------------------
# Backend api_pool — Gin REST API. Sticky cookie + active health check.
# `cookie ... insert indirect nocache`: HAProxy sets the cookie on the
# first response, the browser sends it back, subsequent requests stick
# to the same server. WS upgrades inherit it.
# -----------------------------------------------------------------------
backend api_pool
    balance roundrobin

@ -98,9 +181,7 @@ backend api_pool

# -----------------------------------------------------------------------
# Backend stream_pool — Rust Axum HLS. URI hash so the same track_id
# consistently lands on the same node, keeping the in-process HLS
# segment cache warm. `consistent` flag = jump-hash so adding/removing
# a node doesn't flush the entire pool.
# consistently lands on the same node.
# -----------------------------------------------------------------------
backend stream_pool
    balance uri whole

@ -118,3 +199,5 @@ backend stream_pool
{% for host in stream_hosts %}
    server {{ host }} {{ host }}.lxd:{{ haproxy_stream_server_port }}
{% endfor %}

{% endif %}

@ -35,7 +35,9 @@ veza_app_binary_mode: "0755"
veza_app_container_name: "{{ veza_container_prefix }}{{ veza_component }}-{{ veza_target_color }}"

# URL to fetch the release tarball. Computed once per task chain.
veza_app_artifact_url: "{{ veza_artifact_base_url }}/{{ veza_component }}/{{ veza_release_sha }}/veza-{{ veza_component }}-{{ veza_release_sha }}.tar.zst"
# `veza-<component>` is the Forgejo package name (one package per
# component); SHA is the version; tarball is the filename.
veza_app_artifact_url: "{{ veza_artifact_base_url }}/veza-{{ veza_component }}/{{ veza_release_sha }}/veza-{{ veza_component }}-{{ veza_release_sha }}.tar.zst"

# How long to wait for the container's network namespace to come up
# after `incus launch` before we start running tasks against it.
47
infra/ansible/roles/veza_haproxy_switch/README.md
Normal file

@ -0,0 +1,47 @@
# `veza_haproxy_switch` role

Atomically swap HAProxy's active color. Runs against the
`{{ veza_container_prefix }}haproxy` container after `veza_app` has
recreated + health-probed all three components in the inactive color.

## Why a separate role from `haproxy`?

- `roles/haproxy` provisions a fresh HAProxy container — install
  the package, lay down the *initial* config, enable the systemd
  unit. It runs once when the staging/prod env is bootstrapped and
  occasionally when the global config shape changes.
- `roles/veza_haproxy_switch` performs the *per-deploy* delta —
  re-template the cfg with a new `veza_active_color`, validate,
  swap, HUP. It runs once at the end of every successful deploy.

Splitting them keeps the per-deploy path narrow (no apt, no service
install) and lets `roles/haproxy` remain idempotent when the global
shape hasn't changed.

## Inputs

| variable                | required | meaning                                                          |
| ----------------------- | -------- | ---------------------------------------------------------------- |
| `veza_active_color`     | yes      | Color to switch TO (`blue` or `green`). Becomes the new active.  |
| `veza_release_sha`      | yes      | SHA being deployed. Logged in the active-color history file.     |
| `veza_container_prefix` | inherit  | From group_vars/<env>.yml.                                        |
| `haproxy_topology`      | inherit  | Should be `blue-green` for this role to make sense.               |

## Failure semantics

The render → validate → atomic-swap → HUP sequence runs in an
Ansible `block:` with a `rescue:` that restores `haproxy.cfg.bak`
(captured before the swap) and re-HUPs. So an invalid config or a
HUP failure leaves HAProxy serving the *previous* active color
exactly as before — the deploy as a whole then fails at the playbook
level.

## What the role does NOT do

- It does not destroy or recreate the HAProxy container. That's a
  one-time operation under `roles/haproxy`.
- It does not touch app containers — by the time this role runs,
  blue/green app containers are both healthy.
- It does not remove the previously-active color's containers. They
  survive (intentional) so a rollback can flip back instantly. The
  next deploy naturally recycles them.
18
infra/ansible/roles/veza_haproxy_switch/defaults/main.yml
Normal file

@ -0,0 +1,18 @@
---
# These should be set by the caller — defaults here are guards that
# fail loud if the caller forgot to pass them.
veza_active_color: ""
veza_release_sha: ""

# Paths inside the HAProxy container.
haproxy_cfg_path: /etc/haproxy/haproxy.cfg
haproxy_cfg_new_path: /etc/haproxy/haproxy.cfg.new
haproxy_cfg_backup_path: /etc/haproxy/haproxy.cfg.bak
haproxy_state_dir: /var/lib/veza
haproxy_active_color_file: /var/lib/veza/active-color
haproxy_active_color_history: /var/lib/veza/active-color.history

# How many history entries to keep before pruning. The rollback role
# offers point-in-time switch within this window without redeploying
# the artefact.
haproxy_active_color_history_keep: 5

@ -0,0 +1,9 @@
---
# HUP haproxy via systemd reload (graceful — drains old workers).
# Used both on success (after atomic swap) and on rescue (after
# restoring backup).
- name: Reload haproxy
  ansible.builtin.systemd:
    name: haproxy
    state: reloaded
  listen: "veza-haproxy reload"
16
infra/ansible/roles/veza_haproxy_switch/meta/main.yml
Normal file

@ -0,0 +1,16 @@
---
galaxy_info:
  role_name: veza_haproxy_switch
  author: Veza Ops
  description: >-
    Atomically swap HAProxy's active color (blue/green) and persist
    the new state. Runs once per deploy, after veza_app has health-
    probed all components in the inactive color. Block/rescue
    guarantees HAProxy never lands on a bad config.
  license: proprietary
  min_ansible_version: "2.15"
  platforms:
    - name: Debian
      versions: ["13"]

dependencies: []
142
infra/ansible/roles/veza_haproxy_switch/tasks/main.yml
Normal file

@ -0,0 +1,142 @@
# Atomic blue/green switch. The HAProxy template lives in
# roles/haproxy/templates/haproxy.cfg.j2 — it reads veza_active_color
# to render the right `backup` directives. We re-template, validate,
# atomic-swap, HUP.
#
# Block/rescue: any failure in the four-step sequence restores
# haproxy.cfg from the backup we capture before touching anything.
# That way, an invalid template or a HUP error never leaves HAProxy
# serving from a stale or broken cfg — it stays on whatever was
# active when the role started.
---
- name: Validate inputs
  ansible.builtin.assert:
    that:
      - veza_active_color in ['blue', 'green']
      - veza_release_sha | length == 40
    fail_msg: >-
      veza_haproxy_switch role requires veza_active_color (blue|green)
      and veza_release_sha (40-char git SHA). Got: color={{ veza_active_color }}
      sha={{ veza_release_sha }}.
    quiet: true
  tags: [veza_haproxy_switch, always]

- name: Ensure veza state dir exists in HAProxy container
  ansible.builtin.file:
    path: "{{ haproxy_state_dir }}"
    state: directory
    owner: root
    group: root
    mode: "0755"
  tags: [veza_haproxy_switch]

- name: Read currently-active color (if any)
  ansible.builtin.slurp:
    src: "{{ haproxy_active_color_file }}"
  register: prior_color_raw
  failed_when: false
  changed_when: false
  tags: [veza_haproxy_switch]

- name: Resolve prior_active_color (default blue if no history)
  ansible.builtin.set_fact:
    prior_active_color: >-
      {{ (prior_color_raw.content | b64decode | trim) if prior_color_raw.content is defined
         else 'blue' }}
  tags: [veza_haproxy_switch]

- name: Switch sequence (block/rescue — restores cfg on any failure)
  block:
    - name: Backup current haproxy.cfg
      ansible.builtin.copy:
        src: "{{ haproxy_cfg_path }}"
        dest: "{{ haproxy_cfg_backup_path }}"
        remote_src: true
        mode: "0640"
      tags: [veza_haproxy_switch]

    - name: Render fresh haproxy.cfg with new active_color
      ansible.builtin.template:
        src: "{{ playbook_dir }}/../roles/haproxy/templates/haproxy.cfg.j2"
        dest: "{{ haproxy_cfg_new_path }}"
        owner: root
        group: haproxy
        mode: "0640"
        validate: "haproxy -f %s -c -q"
      vars:
        # Make absolutely sure the template sees the new color we are
        # switching to — set both names because the older template
        # used `veza_active_color` and a future revision might use
        # `haproxy_active_color`.
        haproxy_active_color: "{{ veza_active_color }}"
      tags: [veza_haproxy_switch]

    - name: Atomic swap — mv haproxy.cfg.new → haproxy.cfg
      ansible.builtin.command: mv -f "{{ haproxy_cfg_new_path }}" "{{ haproxy_cfg_path }}"
      changed_when: true
      tags: [veza_haproxy_switch]

    - name: HUP haproxy (graceful reload, no connection drop)
      ansible.builtin.systemd:
        name: haproxy
        state: reloaded
      tags: [veza_haproxy_switch]
  rescue:
    - name: Restore haproxy.cfg from backup
      ansible.builtin.command: mv -f "{{ haproxy_cfg_backup_path }}" "{{ haproxy_cfg_path }}"
      failed_when: false   # benign if the failure happened before the backup was written
      changed_when: true
      tags: [veza_haproxy_switch]

    - name: HUP haproxy back to the prior config
      ansible.builtin.systemd:
        name: haproxy
        state: reloaded
      failed_when: false
      tags: [veza_haproxy_switch]

    - name: Report the failure
      ansible.builtin.fail:
        msg: >-
          HAProxy switch to color {{ veza_active_color }} (sha
          {{ veza_release_sha[:12] }}) failed — config rolled back
          to the prior state. HAProxy continues serving from
          {{ prior_active_color }}. Inspect the validate step's
          stderr in the playbook output above.

# Success path: persist new active color + history.
- name: Write new active color
  ansible.builtin.copy:
    dest: "{{ haproxy_active_color_file }}"
    content: "{{ veza_active_color }}\n"
    owner: root
    group: root
    mode: "0644"
  tags: [veza_haproxy_switch]

- name: Append to active-color history
  ansible.builtin.lineinfile:
    path: "{{ haproxy_active_color_history }}"
    line: "{{ ansible_date_time.iso8601 }} sha={{ veza_release_sha }} color={{ veza_active_color }} prior={{ prior_active_color }}"
    create: true
    insertbefore: BOF
    mode: "0644"
  tags: [veza_haproxy_switch]
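
# A history entry produced by the task above looks like (illustrative values):
#   2025-06-01T10:42:07Z sha=0123456789abcdef0123456789abcdef01234567 color=green prior=blue
# The fast-rollback path in playbooks/rollback.yml regex-extracts the sha
# from the newest line.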

- name: Prune history beyond keep limit
  ansible.builtin.shell: |
    set -e
    if [ -f "{{ haproxy_active_color_history }}" ]; then
      head -n {{ haproxy_active_color_history_keep }} "{{ haproxy_active_color_history }}" > "{{ haproxy_active_color_history }}.tmp"
      mv -f "{{ haproxy_active_color_history }}.tmp" "{{ haproxy_active_color_history }}"
    fi
  args:
    executable: /bin/bash
  changed_when: false
  tags: [veza_haproxy_switch]

- name: Drop the now-stale backup
  ansible.builtin.file:
    path: "{{ haproxy_cfg_backup_path }}"
    state: absent
  tags: [veza_haproxy_switch]

@ -198,3 +198,22 @@ incus-logs: ## [LOW] Show logs from Incus container (usage: make incus-logs SERV
		exit 1; \
	fi
	@incus exec veza-$(SERVICE) -- journalctl -f

# ==============================================================================
# CANARY RELEASE (W5 Day 23)
# ==============================================================================

.PHONY: deploy-canary

deploy-canary: ## [HIGH] Canary release: drain → deploy → SLI monitor → rollback on red. ARTIFACT=/path required. See docs/CANARY_RELEASE.md.
	@if [ -z "$(ARTIFACT)" ]; then \
		$(ECHO_CMD) "${RED}❌ ARTIFACT=/path/to/veza-api required${NC}"; \
		$(ECHO_CMD) "${YELLOW}   See docs/CANARY_RELEASE.md for the full env-var surface.${NC}"; \
		exit 1; \
	fi
	@$(ECHO_CMD) "${BLUE}🚦 Canary deploy: $(ARTIFACT)${NC}"
	@ARTIFACT="$(ARTIFACT)" \
		ROLLBACK_BINARY="$(ROLLBACK_BINARY)" \
		SLI_WINDOW="$(SLI_WINDOW)" \
		PROM_URL="$(PROM_URL)" \
		bash $(CURDIR)/scripts/deploy-canary.sh
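
# Example invocation (illustrative paths/values; PROM_URL and SLI_WINDOW fall
# back to the defaults documented in scripts/deploy-canary.sh):
#   make deploy-canary ARTIFACT=build/veza-api ROLLBACK_BINARY=build/veza-api.prev SLI_WINDOW=1800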

112
scripts/check-migration-backward-compat.sh
Executable file

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# check-migration-backward-compat.sh — pre-deploy gate for canary releases.
#
# Refuses to deploy when the latest migration is NOT backward-compatible
# with the running schema. Backward-compat = the OLD code can still
# read/write against the NEW schema for at least one canary window
# (otherwise canary mode is meaningless; the old node would crash on
# the first request that touches a removed column).
#
# Heuristic: reject migrations that contain any of these patterns:
#   - DROP COLUMN
#   - DROP TABLE
#   - ALTER COLUMN ... TYPE     (type change is rarely backward-compat)
#   - ADD COLUMN ... NOT NULL   (without DEFAULT — old code can't INSERT)
#   - DROP CONSTRAINT
#   - DROP INDEX UNIQUE         (existing data may already violate)
#
# This is a STATIC check; some patterns are false-positives (e.g.
# DROP COLUMN of a column that no code reads). When a real migration
# is flagged, the operator either:
#   1. Splits the migration: ship the additive part now, drop in v+1
#      after old-version backends are decommissioned (see the commented
#      example below).
#   2. Bypasses with FORCE_MIGRATE=1 + a justification in the commit
#      message of the migration file.
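#
# Illustrative SQL for the split in option 1 (example table/column names,
# not from this repo):
#   -- v1, ships alongside the canary (additive; old code keeps working):
#   --   ALTER TABLE tracks ADD COLUMN duration_ms BIGINT;
#   -- v2, ships only after all old-version backends are gone:
#   --   ALTER TABLE tracks DROP COLUMN duration_sec;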
#
# v1.0.9 W5 Day 23.
#
# Usage:
#   bash scripts/check-migration-backward-compat.sh
#
# Required env:
#   MIGRATIONS_DIR   default veza-backend-api/migrations
#   GIT_RANGE        default origin/main..HEAD; the range to inspect for
#                    newly-added migration files
# Optional env:
#   FORCE_MIGRATE=1  bypass with a logged warning. Use sparingly.
#
# Exit codes:
#   0 — all new migrations are backward-compat (or FORCE_MIGRATE=1)
#   1 — at least one migration carries a forbidden pattern
#   3 — required tool missing / config error
set -euo pipefail

MIGRATIONS_DIR=${MIGRATIONS_DIR:-veza-backend-api/migrations}
GIT_RANGE=${GIT_RANGE:-origin/main..HEAD}
FORCE_MIGRATE=${FORCE_MIGRATE:-0}

log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; }
fail() { log "FAIL: $*"; exit "${2:-1}"; }

require() {
  command -v "$1" >/dev/null 2>&1 || fail "required tool missing: $1" 3
}

require git
require grep
require date

# Patterns that indicate a non-backward-compat schema change.
# Single-quoted so the pipe characters survive as ERE alternations.
FORBIDDEN_PATTERNS='DROP COLUMN|DROP TABLE|ALTER COLUMN [A-Za-z_]+ TYPE|ADD COLUMN [A-Za-z_]+ [^,;]* NOT NULL[^,;]*(;|$)|DROP CONSTRAINT|DROP INDEX [A-Za-z_]*UNIQUE'

# Identify newly-added migration files in the current range.
new_migrations=$(git diff --name-only --diff-filter=A "$GIT_RANGE" -- "$MIGRATIONS_DIR" 2>/dev/null \
  | grep -E "^${MIGRATIONS_DIR}/[0-9]+_.*\.sql$" || true)

if [ -z "$new_migrations" ]; then
  log "no new migrations in $GIT_RANGE — nothing to check"
  exit 0
fi

log "checking $(echo "$new_migrations" | wc -l) new migration(s) in $GIT_RANGE"
findings=0
for f in $new_migrations; do
  log "  scanning $f"
  # -i case-insensitive; -E extended regex; -n line numbers
  matches=$(grep -inE "$FORBIDDEN_PATTERNS" "$f" || true)
  if [ -n "$matches" ]; then
    findings=$((findings + 1))
    log ""
    log "  ⚠ NON-BACKWARD-COMPAT pattern in $f:"
    echo "$matches" | sed 's/^/    /' >&2
    # Special case: ADD COLUMN ... NOT NULL ... DEFAULT <x> is fine.
    # The regex above tries to exclude that but the match-then-filter
    # approach is more reliable than a single regex. Suppress matches
    # that include `DEFAULT` on the same line.
    real=$(echo "$matches" | grep -ivE "DEFAULT" || true)
    if [ -z "$real" ]; then
      log "  ↳ all matches include a DEFAULT clause — actually backward-compat"
      findings=$((findings - 1))
    fi
  fi
done

if [ "$findings" -gt 0 ]; then
  log ""
  log "$findings migration(s) flagged as potentially non-backward-compat."
  if [ "$FORCE_MIGRATE" = "1" ]; then
    log "FORCE_MIGRATE=1 set — proceeding anyway."
    exit 0
  fi
  log ""
  log "Options to proceed:"
  log "  1. Split the migration: ship the additive part now, drop the"
  log "     non-compat part in v+1 after old backends are off."
  log "  2. Set FORCE_MIGRATE=1 if you accept the risk + document the"
  log "     justification in the migration's commit message."
  exit 1
fi

log "PASS: all new migrations are backward-compat"
exit 0

287
scripts/deploy-canary.sh
Executable file

@ -0,0 +1,287 @@
#!/usr/bin/env bash
# deploy-canary.sh — canary release for the active/active backend-api fleet.
#
# Walks the standard canary recipe (drain → deploy → health → re-enable
# → SLI monitor → repeat or rollback) end-to-end. Designed to run on
# the host that owns the backend-api Incus containers + the haproxy
# admin socket.
#
# v1.0.9 W5 Day 23.
#
# Usage :
#   bash scripts/deploy-canary.sh /path/to/new/veza-api
#
# Required tools : incus, curl, socat (HAProxy admin socket), jq, bash 4+.
#
# Required env :
#   ARTIFACT                 path to the new veza-api binary (passed as $1 too)
# Optional env :
#   POOL_BACKEND             HAProxy backend name (default api_pool)
#   CANARY_NODE              which container to canary first (default backend-api-2)
#   PEER_NODES               comma-separated list of peers to roll AFTER canary
#                            succeeds (default backend-api-1)
#   HAPROXY_CONTAINER        Incus container running HAProxy (default haproxy)
#   HEALTH_HOST              host to curl (default haproxy.lxd ; LB-routed)
#   HEALTH_PATH              default /api/v1/health
#   SLI_WINDOW               SLI monitor duration in seconds (default 3600 = 1h)
#   SLI_PROBE_INTERVAL       seconds between SLI probes (default 30)
#   PROM_URL                 Prometheus query URL (default http://prom.lxd:9090)
#   PROM_P95_THRESHOLD_S     p95 SLI threshold in seconds (default 0.5)
#   PROM_ERR_RATE_THRESHOLD  error rate threshold (default 0.005 = 0.5%)
#   ROLLBACK_BINARY          path to the previous-known-good binary (used on red).
#                            If unset, rollback skips the binary swap and just
#                            re-enables the canary node — operator handles the
#                            real revert.
#   PRE_DEPLOY_HOOK          path to script that validates migrations are
#                            backward-compat. Defaults to scripts/check-migration-backward-compat.sh
#                            when present.
#
# Exit codes :
#   0 — canary + full roll succeeded
#   1 — pre-deploy validation failed ; nothing was changed
#   2 — canary failed ; rollback executed
#   3 — required tool / env missing
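#
# Example invocation (illustrative values only — the release paths are made up,
# everything else falls back to the defaults listed above) :
#
#   ROLLBACK_BINARY=/opt/veza/releases/veza-api-prev \
#   SLI_WINDOW=1800 PROM_P95_THRESHOLD_S=0.4 \
#   bash scripts/deploy-canary.sh /opt/veza/releases/veza-api-new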
set -euo pipefail

REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"

ARTIFACT=${ARTIFACT:-${1:-?}}
POOL_BACKEND=${POOL_BACKEND:-api_pool}
CANARY_NODE=${CANARY_NODE:-backend-api-2}
PEER_NODES=${PEER_NODES:-backend-api-1}
HEALTH_HOST=${HEALTH_HOST:-haproxy.lxd}
HEALTH_PATH=${HEALTH_PATH:-/api/v1/health}
SLI_WINDOW=${SLI_WINDOW:-3600}
SLI_PROBE_INTERVAL=${SLI_PROBE_INTERVAL:-30}
PROM_URL=${PROM_URL:-http://prom.lxd:9090}
PROM_P95_THRESHOLD_S=${PROM_P95_THRESHOLD_S:-0.5}
PROM_ERR_RATE_THRESHOLD=${PROM_ERR_RATE_THRESHOLD:-0.005}
ROLLBACK_BINARY=${ROLLBACK_BINARY:-}
PRE_DEPLOY_HOOK=${PRE_DEPLOY_HOOK:-${REPO_ROOT}/scripts/check-migration-backward-compat.sh}

log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; }
die() { log "FAIL: $*"; exit "${2:-1}"; }

require() {
    command -v "$1" >/dev/null 2>&1 || die "required tool missing: $1" 3
}

require incus
require curl
require socat
require jq
require date

if [ "$ARTIFACT" = "?" ] || [ ! -f "$ARTIFACT" ]; then
    die "ARTIFACT (or \$1) must point to an existing binary" 1
fi

# --------------------------------------------------------------------
# Helpers : HAProxy admin socket commands.
# --------------------------------------------------------------------
HAPROXY_CONTAINER=${HAPROXY_CONTAINER:-haproxy}

ha_cmd() {
    incus exec "$HAPROXY_CONTAINER" -- bash -c "echo '$1' | socat /run/haproxy/admin.sock -"
}

ha_state() {
    local node=$1
    ha_cmd "show servers state $POOL_BACKEND" \
        | awk -v n="$node" '$0 ~ n {print $7}' | head -1
    # In `show servers state` output, field 6 is srv_op_state (0=stopped,
    # 2=running) and field 7 is srv_admin_state (non-zero when drained or
    # in maintenance) ; check the header line HAProxy prints if in doubt.
}
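
# Poking the admin socket through the same helper, e.g. when debugging a
# stuck drain (illustrative ; `show servers state` and `show stat` are
# standard HAProxy runtime commands, but verify the column layout against
# the header line they print) :
#   ha_cmd "show servers state ${POOL_BACKEND}"
#   ha_cmd "show stat" | grep "^${POOL_BACKEND},"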

ha_drain() {
    log "haproxy : drain $1"
    ha_cmd "set server ${POOL_BACKEND}/${1} state drain" >/dev/null
}

ha_ready() {
    log "haproxy : ready $1"
    ha_cmd "set server ${POOL_BACKEND}/${1} state ready" >/dev/null
}

ha_wait_drained() {
    # Drain finishes when the server reports 0 active connections.
    local node=$1
    local deadline=$(( $(date +%s) + 60 ))
    while [ "$(date +%s)" -lt "$deadline" ]; do
        local n
        n=$(ha_cmd "show stat" | awk -F, -v s="$node" '$2 == s {print $5; exit}' 2>/dev/null || echo 0)
        if [ "${n:-0}" = "0" ]; then
            log "haproxy : $node drained (0 active connections)"
            return 0
        fi
        sleep 2
    done
    log "WARN : $node still has active connections after 60s drain ; proceeding anyway"
}

curl_health() {
    curl --max-time 5 -sS -o /dev/null -w "%{http_code}" \
        "http://${HEALTH_HOST}${HEALTH_PATH}" 2>/dev/null || echo "000"
}

# --------------------------------------------------------------------
# SLI monitor — query Prometheus over the SLI_WINDOW. Fails as soon as
# any probe reports red so we can roll back fast.
# --------------------------------------------------------------------
prom_query() {
    local q=$1
    curl --max-time 10 -sS -G --data-urlencode "query=${q}" \
        "${PROM_URL}/api/v1/query" 2>/dev/null \
        | jq -r '.data.result[0].value[1] // "0"' 2>/dev/null || echo 0
}
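
# prom_query assumes the standard instant-query response shape, roughly
# {"data":{"result":[{"value":[<ts>,"<value>"]}]}} — hence the jq path above.
# Illustrative call :
#   prom_query 'sum(rate(veza_gin_http_requests_total{job="veza-backend"}[5m]))'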

monitor_sli() {
    log "monitoring SLI for ${SLI_WINDOW}s (probes every ${SLI_PROBE_INTERVAL}s)"
    local deadline=$(( $(date +%s) + SLI_WINDOW ))
    local probes=0
    local first_red=""
    while [ "$(date +%s)" -lt "$deadline" ]; do
        probes=$((probes + 1))
        local p95 err
        p95=$(prom_query 'histogram_quantile(0.95, sum(rate(veza_gin_http_request_duration_seconds_bucket{job="veza-backend"}[5m])) by (le))')
        err=$(prom_query 'sum(rate(veza_gin_http_requests_total{job="veza-backend",status=~"5.."}[5m])) / sum(rate(veza_gin_http_requests_total{job="veza-backend"}[5m]))')
        log "  probe $probes : p95=${p95}s err=${err}"

        # awk used for float comparison ; bash test only does integers.
        if awk -v a="$p95" -v b="$PROM_P95_THRESHOLD_S" 'BEGIN{exit !(a > b)}'; then
            first_red="p95 ${p95}s > threshold ${PROM_P95_THRESHOLD_S}s"
            break
        fi
        if awk -v a="$err" -v b="$PROM_ERR_RATE_THRESHOLD" 'BEGIN{exit !(a > b)}'; then
            first_red="error rate ${err} > threshold ${PROM_ERR_RATE_THRESHOLD}"
            break
        fi
        sleep "$SLI_PROBE_INTERVAL"
    done
    if [ -n "$first_red" ]; then
        log "SLI red after $probes probe(s) : $first_red"
        return 1
    fi
    log "SLI green for the full ${SLI_WINDOW}s window ($probes probes)"
    return 0
}
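
# The awk float-compare idiom used above, stand-alone (illustrative values) :
#   awk -v a="0.73" -v b="0.5" 'BEGIN{exit !(a > b)}' && echo "p95 breach"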

# --------------------------------------------------------------------
# Deploy + rollback primitives.
# --------------------------------------------------------------------
deploy_to() {
    local node=$1
    local artifact=$2
    log "deploying $artifact → $node"
    incus file push "$artifact" "$node/opt/veza/backend-api/veza-api" \
        --uid 1001 --gid 1001 --mode 0755
    incus exec "$node" -- systemctl restart veza-backend-api
}
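
# Manual equivalent for a single node, handy when debugging a failed push
# (illustrative — same target path and flags as above) :
#   incus file push ./veza-api backend-api-2/opt/veza/backend-api/veza-api \
#     --uid 1001 --gid 1001 --mode 0755
#   incus exec backend-api-2 -- systemctl status veza-backend-api --no-pager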

verify_node_health() {
    local node=$1
    log "node health check : $node"
    local deadline=$(( $(date +%s) + 60 ))
    while [ "$(date +%s)" -lt "$deadline" ]; do
        if incus exec "$node" -- curl --max-time 3 -sSf "http://127.0.0.1:8080${HEALTH_PATH}" >/dev/null 2>&1; then
            log "  $node : 200"
            return 0
        fi
        sleep 2
    done
    return 1
}

rollback_canary() {
    log "ROLLBACK : restoring $CANARY_NODE"
    if [ -n "$ROLLBACK_BINARY" ] && [ -f "$ROLLBACK_BINARY" ]; then
        deploy_to "$CANARY_NODE" "$ROLLBACK_BINARY" || true
        verify_node_health "$CANARY_NODE" || log "rollback : node health check still failing"
    else
        log "ROLLBACK_BINARY not set — leaving binary in place ; operator must finish revert"
    fi
    ha_ready "$CANARY_NODE"
}

# --------------------------------------------------------------------
# 1. Pre-deploy hook (migration backward-compat).
# --------------------------------------------------------------------
log "step 1 : pre-deploy hook"
if [ -x "$PRE_DEPLOY_HOOK" ]; then
    if ! "$PRE_DEPLOY_HOOK"; then
        die "pre-deploy hook ($PRE_DEPLOY_HOOK) reported a backward-incompat migration ; aborting" 1
    fi
else
    log "  PRE_DEPLOY_HOOK ($PRE_DEPLOY_HOOK) not executable ; skipping (no-op)"
fi
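
# To point the hook at a different validator for a one-off run (illustrative
# path — the default hook is scripts/check-migration-backward-compat.sh) :
#   PRE_DEPLOY_HOOK=./scripts/my-extra-check.sh bash scripts/deploy-canary.sh ./veza-api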

# --------------------------------------------------------------------
# 2. Drain canary node.
# --------------------------------------------------------------------
log "step 2 : drain $CANARY_NODE in HAProxy"
ha_drain "$CANARY_NODE"
ha_wait_drained "$CANARY_NODE"

# --------------------------------------------------------------------
# 3. Deploy artifact to the canary node.
# --------------------------------------------------------------------
log "step 3 : deploy artifact to $CANARY_NODE"
deploy_to "$CANARY_NODE" "$ARTIFACT"

# --------------------------------------------------------------------
# 4. Per-node health check.
# --------------------------------------------------------------------
log "step 4 : health check on $CANARY_NODE"
if ! verify_node_health "$CANARY_NODE"; then
    log "$CANARY_NODE failed health check post-deploy"
    rollback_canary
    exit 2
fi

# --------------------------------------------------------------------
# 5. Re-enable + LB health check (proves HAProxy sees the node ready).
# --------------------------------------------------------------------
log "step 5 : re-enable $CANARY_NODE in HAProxy"
ha_ready "$CANARY_NODE"
sleep 5
lb_status=$(curl_health)
if [ "$lb_status" != "200" ]; then
    log "LB health check after re-enable returned $lb_status ; rolling back"
    rollback_canary
    exit 2
fi

# --------------------------------------------------------------------
# 6. SLI monitor — the canary keeps serving live traffic ; if the SLO
#    breaches, roll back.
# --------------------------------------------------------------------
log "step 6 : monitor SLI on the canary"
if ! monitor_sli; then
    log "SLI red — rolling back the canary"
    rollback_canary
    exit 2
fi

# --------------------------------------------------------------------
# 7. SLI green — repeat on each peer.
# --------------------------------------------------------------------
log "step 7 : SLI green on canary, rolling peers : $PEER_NODES"
IFS=',' read -ra peers <<< "$PEER_NODES"
for peer in "${peers[@]}"; do
    log "── peer $peer ───────────────────────────"
    ha_drain "$peer"
    ha_wait_drained "$peer"
    deploy_to "$peer" "$ARTIFACT"
    if ! verify_node_health "$peer"; then
        log "$peer health check failed post-deploy"
        log "WARN : leaving $peer drained ; canary node still serves traffic"
        log "       operator must re-deploy known-good binary or repair $peer manually"
        exit 2
    fi
    ha_ready "$peer"
    sleep 5
    lb_status=$(curl_health)
    if [ "$lb_status" != "200" ]; then
        log "LB health check after re-enable of $peer returned $lb_status — abandoning roll"
        exit 2
    fi
done

log "PASS : canary $CANARY_NODE + peers $PEER_NODES deployed cleanly"
exit 0