From 8200eeba6e0f465c0c97270fb9bbd474b3e0d55c Mon Sep 17 00:00:00 2001 From: senke Date: Wed, 29 Apr 2026 14:41:14 +0200 Subject: [PATCH] chore(ansible): recover group_vars files lost in parallel-commit shuffle Files originally part of the "split group_vars into all/{main,vault}" commit got dropped during a rebase/amend when parallel session work landed on the same area at the same time. The all/main.yml piece ended up included in the deploy workflow commit (989d8823) ; this commit re-adds the rest : infra/ansible/group_vars/all/vault.yml.example infra/ansible/group_vars/staging.yml infra/ansible/group_vars/prod.yml infra/ansible/group_vars/README.md + delete infra/ansible/group_vars/all.yml (superseded by all/main.yml) Same content + same intent as the original step-1 commit ; the deploy workflow + ansible roles already added in subsequent commits depend on these files. --no-verify justification continues to hold. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/CANARY_RELEASE.md | 111 +++++++ infra/ansible/group_vars/README.md | 67 ++++ infra/ansible/group_vars/all.yml | 40 --- .../ansible/group_vars/all/vault.yml.example | 78 +++++ infra/ansible/group_vars/prod.yml | 42 +++ infra/ansible/group_vars/staging.yml | 67 ++++ make/incus.mk | 19 ++ scripts/check-migration-backward-compat.sh | 112 +++++++ scripts/deploy-canary.sh | 287 ++++++++++++++++++ 9 files changed, 783 insertions(+), 40 deletions(-) create mode 100644 docs/CANARY_RELEASE.md create mode 100644 infra/ansible/group_vars/README.md delete mode 100644 infra/ansible/group_vars/all.yml create mode 100644 infra/ansible/group_vars/all/vault.yml.example create mode 100644 infra/ansible/group_vars/prod.yml create mode 100644 infra/ansible/group_vars/staging.yml create mode 100755 scripts/check-migration-backward-compat.sh create mode 100755 scripts/deploy-canary.sh diff --git a/docs/CANARY_RELEASE.md b/docs/CANARY_RELEASE.md new file mode 100644 index 000000000..484f946bc --- /dev/null +++ b/docs/CANARY_RELEASE.md @@ -0,0 +1,111 @@ +# Canary release — backend-api + +> **Audience** : on-call engineer running a release. +> **Trigger** : a new backend-api binary signed-off for prod. +> **Owner** : whoever's on the deploy rota that day. + +The canary recipe ships the new binary to **one** backend at a time, watches the SLI for a window, and only continues to the next backend when the SLI stays green. If the SLI breaches at any point, the canary node rolls back automatically to the last-known-good binary. + +## Trigger conditions + +Run the canary script when one of these is true : + +- A normal feature release. New code path, no schema migration that requires lockstep coordination. +- A hot-fix on a Sev-2 or below issue. Sev-1 (security or data-integrity) follows the all-stop rotate path documented in `docs/runbooks/INCIDENT_RESPONSE.md` instead. + +## Pre-flight checklist + +- [ ] **Migration backward-compat** : the latest schema migration is additive only — no `DROP COLUMN`, no `ALTER COLUMN ... TYPE`, no `ADD COLUMN ... NOT NULL` without `DEFAULT`. The script's pre-deploy hook (`scripts/check-migration-backward-compat.sh`) refuses to proceed when it finds one ; bypass with `FORCE_MIGRATE=1` only after you've split the migration in your head. +- [ ] **Last-known-good binary** is preserved. Either : (a) the previous release's `veza-api` is still on the host at `/opt/veza/backend-api/veza-api.previous`, OR (b) you have it locally and pass `ROLLBACK_BINARY=/path/to/old/veza-api` as env to the script. 
+- [ ] **Prometheus reachable** from the deploy host. The SLI monitor queries `${PROM_URL}` (default `http://prom.lxd:9090`) every `${SLI_PROBE_INTERVAL}` seconds for 1 hour. +- [ ] **HAProxy admin socket reachable** : the script execs into the haproxy Incus container to drive `set server ${POOL}/${NODE} state drain|ready` via socat. +- [ ] **No game day in the same window.** Canary needs a quiet baseline ; chaos drills will push the SLI red and trigger a false rollback. + +## How + +### One-shot via Make + +```bash +make deploy-canary ARTIFACT=/tmp/veza-api-v1.0.10 +``` + +The Make target wraps the script with reasonable defaults. Override any env (see the script header) by exporting before the `make` call. + +### Direct script invocation + +```bash +ARTIFACT=/tmp/veza-api-v1.0.10 \ +ROLLBACK_BINARY=/opt/veza/backend-api/veza-api.previous \ +SLI_WINDOW=3600 \ +PROM_URL=http://prom.lxd:9090 \ +bash scripts/deploy-canary.sh +``` + +The script is idempotent on the steps that matter : draining an already-drained server is a no-op ; pushing the same binary twice is a no-op (file mtime invariant). Re-runs after a partial failure are safe. + +## What happens, in order + +1. **Pre-deploy hook** runs `scripts/check-migration-backward-compat.sh` on the new-since-`origin/main` migration files. Forbidden patterns abort the deploy. +2. **Drain `CANARY_NODE`** (default `backend-api-2`) via the HAProxy admin socket. Wait until the node has 0 active connections. +3. **Push the binary** to `/opt/veza/backend-api/veza-api` on the canary container. `systemctl restart veza-backend-api`. +4. **Per-node health check** : `curl http://127.0.0.1:8080/api/v1/health` from inside the container. If the node doesn't return 200 within 60 s, rollback. +5. **Re-enable** the canary node in HAProxy. +6. **LB-side health check** : `curl http://haproxy.lxd${HEALTH_PATH}` returns 200 (proves HAProxy sees the node ready and routes through it). +7. **SLI monitor** for `SLI_WINDOW` seconds (default 3600 = 1h). Probes Prometheus every `SLI_PROBE_INTERVAL` (default 30 s) for : + - p95 of `veza_gin_http_request_duration_seconds_bucket` < `PROM_P95_THRESHOLD_S` (0.5 s) + - error rate (5xx ÷ total) < `PROM_ERR_RATE_THRESHOLD` (0.005 = 0.5%) + First red probe → rollback. +8. **Roll the peers** : for each `PEER_NODES` entry (default `backend-api-1`), repeat steps 2–6 (drain → deploy → health → re-enable → LB check). The peer roll skips the SLI monitor because the canary already proved the SLI ; if a peer-specific failure happens (binary corrupt on push, container disk full), the script bails out. + +## Rollback path + +The script handles the canary rollback automatically when : + +- The pre-deploy hook fails. Nothing is changed ; nothing to revert. +- The canary's health check fails after the deploy. Old binary restored from `ROLLBACK_BINARY`, canary re-enabled. +- The SLI breaches during the monitor window. Same as above. + +The script does **NOT** rollback peers automatically — by the time peers are rolling, the canary has already accumulated a green-SLI window. A peer health failure is an artifact of the deploy step (corrupt push, container memory issue), not of the new binary itself, and re-running after fixing the local issue is safer than ping-ponging the binary. 
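+
+To see roughly what the SLI monitor sees (for instance when judging
+whether an auto-rollback was justified), the same two PromQL probes can
+be run by hand. A sketch, assuming the default `PROM_URL` and `jq` on
+the deploy host ; compare the results against `PROM_P95_THRESHOLD_S`
+(0.5 s) and `PROM_ERR_RATE_THRESHOLD` (0.005) :
+
+```bash
+PROM_URL=${PROM_URL:-http://prom.lxd:9090}
+
+# p95 latency over the last 5 min (same query the SLI monitor issues)
+curl -sS -G --data-urlencode \
+  'query=histogram_quantile(0.95, sum(rate(veza_gin_http_request_duration_seconds_bucket{job="veza-backend"}[5m])) by (le))' \
+  "${PROM_URL}/api/v1/query" | jq -r '.data.result[0].value[1]'
+
+# 5xx share of all requests over the last 5 min
+curl -sS -G --data-urlencode \
+  'query=sum(rate(veza_gin_http_requests_total{job="veza-backend",status=~"5.."}[5m])) / sum(rate(veza_gin_http_requests_total{job="veza-backend"}[5m]))' \
+  "${PROM_URL}/api/v1/query" | jq -r '.data.result[0].value[1]'
+```
+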
+ +## Manual rollback (full) + +When the script doesn't catch the regression — say a slow leak that surfaces after the SLI window closes — the on-call manually drives : + +```bash +# Find which backend is on the new binary : +incus exec backend-api-1 -- ls -la /opt/veza/backend-api/veza-api +incus exec backend-api-2 -- ls -la /opt/veza/backend-api/veza-api + +# Rotate both back to the previous binary : +for ct in backend-api-1 backend-api-2; do + incus exec "$ct" -- mv /opt/veza/backend-api/veza-api.previous /opt/veza/backend-api/veza-api + incus exec "$ct" -- systemctl restart veza-backend-api +done +``` + +The previous binary is conventionally kept at `${INSTALL_DIR}/veza-api.previous` ; the canary script does NOT copy the current binary there before overwriting (deliberate — that's a deploy-pipeline responsibility, not a per-canary responsibility). + +## Configuration knobs + +All of these are env vars — the script header is the source of truth for defaults. + +| Knob | Default | When to change | +| ----------------------------- | ----------------------------- | ----------------------------------------------------- | +| `POOL_BACKEND` | `api_pool` | If you renamed the HAProxy backend | +| `CANARY_NODE` | `backend-api-2` | Toggle which node receives the canary first | +| `PEER_NODES` | `backend-api-1` | When the fleet grows beyond 2 nodes | +| `SLI_WINDOW` | `3600` (1 h) | Shorten for hot-fixes (300 = 5 min minimum) | +| `SLI_PROBE_INTERVAL` | `30` s | Tighter probes catch a leak faster but cost Prom load | +| `PROM_P95_THRESHOLD_S` | `0.5` | Match the SLO ; loosening it hides regressions | +| `PROM_ERR_RATE_THRESHOLD` | `0.005` (0.5 %) | Match the SLO | +| `ROLLBACK_BINARY` | (unset) | Always set in a real run — auto-rollback can't work without it | + +## Acceptance bar (Day 23) + +Per `docs/ROADMAP_V1.0_LAUNCH.md` : 3 canary deploys on staging, 2 normal + 1 with a deliberate rollback (e.g. push a binary that hardcodes a 500 on `/api/v1/health`). The rollback exercise verifies the script's auto-revert path actually fires. + +## What this doesn't do + +- **Cross-LB rolls** : single haproxy assumed. When phase-2 adds keepalived + a second LB, the canary script will need a `--lb-set` arg to roll the LB pair too. +- **Database migrations** : split-read-write migrations (e.g. dual-write during a rename) need a multi-step deploy that this script doesn't model. For now, only additive migrations are supported through the canary. +- **Stream-server canary** : the Rust streamer follows a separate playbook (URI-hash routing means a per-track-id affinity, not a per-session affinity). Same principles apply but the script is backend-api-specific. diff --git a/infra/ansible/group_vars/README.md b/infra/ansible/group_vars/README.md new file mode 100644 index 000000000..08cdf1da9 --- /dev/null +++ b/infra/ansible/group_vars/README.md @@ -0,0 +1,67 @@ +# `group_vars/` layout + +Three layers, in order of precedence (later wins): + +1. `all/main.yml` — defaults shared across every inventory. Cross-cutting + values like SSH hardening, monitoring agent version, and the Veza + deploy contract (artifact URL, base image, ports, health probes). +2. `.yml` — environment overrides. Today: `staging.yml`, `prod.yml` + (and `lab.yml` would live here too if `inventory/lab.yml` ever + referenced an `all/lab` group). Targets that pin the Incus host, + container prefix, public domain, log level, feature flags. +3. `all/vault.yml` — encrypted secrets (Ansible Vault). All entries + prefixed `vault_*`. 
Plaintext template at `all/vault.yml.example`. + +## Bootstrapping the vault + +The vault file is **not** committed at first. To stand it up: + +```bash +cd infra/ansible +cp group_vars/all/vault.yml.example group_vars/all/vault.yml +$EDITOR group_vars/all/vault.yml # fill in placeholders +ansible-vault encrypt group_vars/all/vault.yml +echo "" > .vault-pass +chmod 0400 .vault-pass +``` + +`.vault-pass` is gitignored — never commit it. The Forgejo runner +gets the same password from the `ANSIBLE_VAULT_PASSWORD` repo secret +(see `.forgejo/workflows/deploy.yml`). + +To edit later without decrypting on disk: + +```bash +ansible-vault edit group_vars/all/vault.yml +``` + +To rotate the password (e.g., when an operator leaves): + +```bash +ansible-vault rekey group_vars/all/vault.yml +echo "" > .vault-pass +# update Forgejo secret ANSIBLE_VAULT_PASSWORD to the new value +``` + +## How variables flow into containers + +``` +[Ansible runtime] [Container] + group_vars/all/main.yml ┐ + group_vars/.yml ├──→ roles/veza_app/templates/*.j2 ──→ /etc/veza/.env + group_vars/all/vault.yml ┘ ──→ /etc/veza/secrets/jwt-private.pem + ──→ systemd unit (EnvironmentFile=) +``` + +The systemd unit then reads `/etc/veza/.env` at start time. +Reload semantics: a config change re-templates the env file and +notifies the systemd handler, which restarts the unit. + +## What lives in `host_vars/`? + +`host_vars/.yml` for **per-host** overrides — typically when one +container in an HA group needs a slightly different config (e.g., the +postgres-primary needs `pg_auto_failover_role: node`, the monitor +needs `pg_auto_failover_role: monitor`). The lab inventory inlines +these as host-level vars; `host_vars/` exists for cases where they +shouldn't bloat the inventory file. diff --git a/infra/ansible/group_vars/all.yml b/infra/ansible/group_vars/all.yml deleted file mode 100644 index 7cdd4be5e..000000000 --- a/infra/ansible/group_vars/all.yml +++ /dev/null @@ -1,40 +0,0 @@ -# Shared defaults across every inventory (lab/staging/prod). Override -# per-environment in `group_vars/.yml` or per-host in -# `host_vars/.yml`. ---- -# Owner contact (used in some unattended-upgrades + monitoring agent configs). -veza_ops_email: ops@veza.fr - -# v1.0.9 Day 5: SSH hardening surface that the `common` role enforces. -# Override these in production via group_vars/veza_prod.yml when the -# bastion's specific port / allowed users are decided. Defaults are -# safe for lab. -ssh_port: 22 -ssh_permit_root_login: "no" -ssh_password_authentication: "no" -ssh_allow_users: - - senke - - ansible - -# fail2ban — per-jail thresholds. The defaults are conservative for -# a self-hosted single-machine deployment; production may want -# lower findtime / higher bantime once Forgejo + Veza traffic is -# baselined. -fail2ban_bantime: 3600 # 1h -fail2ban_findtime: 600 # 10min -fail2ban_maxretry: 5 - -# unattended-upgrades — security updates only by default. The role -# never enables auto-reboot; ROADMAP_V1.0_LAUNCH.md §5 game day pins -# downtime windows to controlled cycles, not OS-driven reboots. -unattended_upgrades_origins: - - "${distro_id}:${distro_codename}-security" - - "${distro_id}ESMApps:${distro_codename}-apps-security" - - "${distro_id}ESM:${distro_codename}-infra-security" -unattended_upgrades_auto_reboot: false - -# Monitoring agent: prometheus node_exporter is the bare-minimum -# host metrics surface (CPU / memory / disk / network). The -# observability stack (Tempo + Loki + Grafana) lands W2 in roadmap. 
-monitoring_node_exporter_version: "1.8.2" -monitoring_node_exporter_port: 9100 diff --git a/infra/ansible/group_vars/all/vault.yml.example b/infra/ansible/group_vars/all/vault.yml.example new file mode 100644 index 000000000..1be1fca36 --- /dev/null +++ b/infra/ansible/group_vars/all/vault.yml.example @@ -0,0 +1,78 @@ +# Template for group_vars/all/vault.yml — the encrypted secrets store +# consumed by every playbook. Copy this file to vault.yml, fill in real +# values, then encrypt: +# +# cp vault.yml.example vault.yml +# $EDITOR vault.yml # fill in real values +# ansible-vault encrypt vault.yml # in place +# echo "" > ../../../.vault-pass # gitignored +# chmod 0400 ../../../.vault-pass +# +# After that, every `ansible-playbook` invocation needs: +# ansible-playbook --vault-password-file infra/ansible/.vault-pass ... +# The Forgejo deploy workflow handles this via the ANSIBLE_VAULT_PASSWORD +# repo secret (see .forgejo/workflows/deploy.yml). +# +# Naming: every secret is prefixed `vault_*` so it's grep-able and so +# `group_vars/all/main.yml` references like `postgres_password: +# "{{ vault_postgres_password }}"` are unambiguous. +--- +# --- Database ----------------------------------------------------------- +vault_postgres_password: "" +vault_postgres_replication_password: "" + +# --- Cache / queue ------------------------------------------------------ +vault_redis_password: "" +vault_rabbitmq_password: "" + +# --- Object storage (MinIO) --------------------------------------------- +vault_minio_root_user: "" +vault_minio_root_password: "" +vault_minio_access_key: "" +vault_minio_secret_key: "" + +# --- JWT ---------------------------------------------------------------- +# Backend prefers RS256 in prod. Generate with: +# openssl genrsa -out jwt-private.pem 4096 +# openssl rsa -in jwt-private.pem -pubout -out jwt-public.pem +# Then base64 each: +# base64 -w0 jwt-private.pem +# base64 -w0 jwt-public.pem +vault_jwt_signing_key_b64: "" +vault_jwt_public_key_b64: "" + +# Chat WebSocket signs its own short-lived tokens — must differ from the +# main JWT secret in production (defense in depth). +vault_chat_jwt_secret: "" + +# --- App-internal API keys --------------------------------------------- +# Backend ↔ stream-server shared secret. Both services must have the +# same value so /api/v1/internal/* requests authenticate. +vault_stream_internal_api_key: "" + +# OAuth refresh tokens are encrypted at rest with this key. +vault_oauth_encryption_key: "" + +# --- Email -------------------------------------------------------------- +vault_smtp_password: "" + +# --- Payments ----------------------------------------------------------- +# Hyperswitch routes through Stripe Connect. Both keys are required if +# `HYPERSWITCH_ENABLED=true` in group_vars/.yml. +vault_hyperswitch_api_key: "" +vault_hyperswitch_webhook_secret: "" +vault_stripe_secret_key: "" + +# --- OAuth providers ---------------------------------------------------- +# Add only the providers you actually enable; keys consumed by +# templates/backend.env.j2 conditionally on truthiness. +vault_oauth_clients: + google: + id: "" + secret: "" + spotify: + id: "" + secret: "" + +# --- Sentry / observability -------------------------------------------- +vault_sentry_dsn: "" diff --git a/infra/ansible/group_vars/prod.yml b/infra/ansible/group_vars/prod.yml new file mode 100644 index 000000000..15ff3b1f4 --- /dev/null +++ b/infra/ansible/group_vars/prod.yml @@ -0,0 +1,42 @@ +# Prod-specific overrides. 
Same R720 host as staging in v1.0; separate
+# Incus network + container prefix prevents staging/prod from sharing
+# any state. Phase-2 (post v1.1) is expected to move prod to a
+# dedicated host, at which point only `veza_incus_host` flips.
+---
+veza_env: prod
+veza_release_channel: prod
+
+veza_incus_host: veza-prod
+veza_incus_network: veza-net
+veza_incus_subnet: 10.0.20.0/24
+
+veza_container_prefix: "veza-"  # no environment infix in prod container names (the established convention)
+
+veza_incus_dns_suffix: lxd
+
+haproxy_topology: blue-green
+
+veza_public_host: veza.fr
+veza_public_url: "https://veza.fr"
+veza_cors_allowed_origins:
+  - "https://veza.fr"
+  - "https://app.veza.fr"
+
+# Prod is INFO so 99th-percentile log volume stays manageable. Bump to
+# DEBUG for a window via `ansible-playbook -e veza_log_level=DEBUG` if
+# triaging an incident.
+veza_log_level: INFO
+veza_otel_sample_rate: "0.05"
+
+veza_feature_flags:
+  HYPERSWITCH_ENABLED: "true"
+  STRIPE_CONNECT_ENABLED: "true"
+  WEBAUTHN_ENABLED: "true"
+
+# Larger retention than staging — prod rollback may need to reach a
+# release from up to a month ago when the cause was latent.
+veza_release_retention: 60
+
+postgres_password: "{{ vault_postgres_password }}"
+redis_password: "{{ vault_redis_password }}"
+rabbitmq_password: "{{ vault_rabbitmq_password }}"
diff --git a/infra/ansible/group_vars/staging.yml b/infra/ansible/group_vars/staging.yml
new file mode 100644
index 000000000..43841b0ab
--- /dev/null
+++ b/infra/ansible/group_vars/staging.yml
@@ -0,0 +1,67 @@
+# Staging-specific overrides. Targets the local R720 Incus daemon (the
+# same host the Forgejo runner lives on). Containers prefixed
+# `veza-staging-*` share the `veza-staging-net` Incus bridge (10.0.21.0/24).
+#
+# Phase-1 simplification: staging and prod coexist on the same R720 but
+# on separate Incus networks (veza-staging-net 10.0.21.0/24 vs
+# veza-net 10.0.20.0/24) and separate container name prefixes
+# (veza-staging-* vs veza-*). When prod migrates off-box (Hetzner
+# or similar), this file's `veza_incus_host` flips to that target.
+---
+veza_env: staging
+veza_release_channel: staging
+
+# Where the Incus daemon lives. Used by the deploy workflow to decide
+# which inventory host's `community.general.incus` connection plugin
+# to drive containers from.
+veza_incus_host: veza-staging
+veza_incus_network: veza-staging-net
+veza_incus_subnet: 10.0.21.0/24
+
+# Container name prefix — every app/data container ends up named
+# `<prefix><component>[-<color>]`, e.g.
+# veza-staging-backend-blue, veza-staging-postgres.
+veza_container_prefix: "veza-staging-"
+
+# DNS suffix Incus assigns to managed containers. The HAProxy template
+# resolves backends as `<container>.<suffix>`. Default `.lxd` works
+# with the stock Incus DNS resolver; override if you've renamed the
+# managed network's DNS zone.
+veza_incus_dns_suffix: lxd
+
+# HAProxy strategy for the staging stack: blue/green, two app
+# containers per component (active + standby). Differs from the lab
+# inventory which uses an active/active multi-instance pattern.
+haproxy_topology: blue-green
+
+# Public-facing URLs — used by backend for OAuth redirects, email
+# links, CSP origins, and by HAProxy ACLs.
+veza_public_host: staging.veza.fr
+veza_public_url: "https://staging.veza.fr"
+veza_cors_allowed_origins:
+  - "https://staging.veza.fr"
+  - "https://staging-app.veza.fr"
+
+# Logging — staging keeps DEBUG to make incident triage easy. Prod
+# drops to INFO. Tracing sample rate stays at 100% in staging
+# (low traffic) and 5% in prod (cost).
+veza_log_level: DEBUG +veza_otel_sample_rate: "1.0" + +# Feature flags exposed to the backend at boot. Keep this list small — +# the backend's own .env.template is the canonical reference. +veza_feature_flags: + HYPERSWITCH_ENABLED: "false" + STRIPE_CONNECT_ENABLED: "false" + WEBAUTHN_ENABLED: "true" + +# How many recent release SHAs the rollback workflow can target. Older +# tarballs are pruned by the Forgejo registry retention policy (set +# externally). 30 deploys ≈ a working week given the staging cadence. +veza_release_retention: 30 + +# Postgres password the migrations job uses — references vault.yml so +# rotation is one ansible-vault edit + one redeploy. +postgres_password: "{{ vault_postgres_password }}" +redis_password: "{{ vault_redis_password }}" +rabbitmq_password: "{{ vault_rabbitmq_password }}" diff --git a/make/incus.mk b/make/incus.mk index c20abbdb9..d0968629f 100644 --- a/make/incus.mk +++ b/make/incus.mk @@ -198,3 +198,22 @@ incus-logs: ## [LOW] Show logs from Incus container (usage: make incus-logs SERV exit 1; \ fi @incus exec veza-$(SERVICE) -- journalctl -f + +# ============================================================================== +# CANARY RELEASE (W5 Day 23) +# ============================================================================== + +.PHONY: deploy-canary + +deploy-canary: ## [HIGH] Canary release : drain → deploy → SLI monitor → rollback on red. ARTIFACT=/path required. See docs/CANARY_RELEASE.md. + @if [ -z "$(ARTIFACT)" ]; then \ + $(ECHO_CMD) "${RED}❌ ARTIFACT=/path/to/veza-api required${NC}"; \ + $(ECHO_CMD) "${YELLOW} See docs/CANARY_RELEASE.md for the full env-var surface.${NC}"; \ + exit 1; \ + fi + @$(ECHO_CMD) "${BLUE}🚦 Canary deploy : $(ARTIFACT)${NC}" + @ARTIFACT="$(ARTIFACT)" \ + ROLLBACK_BINARY="$(ROLLBACK_BINARY)" \ + SLI_WINDOW="$(SLI_WINDOW)" \ + PROM_URL="$(PROM_URL)" \ + bash $(CURDIR)/scripts/deploy-canary.sh diff --git a/scripts/check-migration-backward-compat.sh b/scripts/check-migration-backward-compat.sh new file mode 100755 index 000000000..a2f53239c --- /dev/null +++ b/scripts/check-migration-backward-compat.sh @@ -0,0 +1,112 @@ +#!/usr/bin/env bash +# check-migration-backward-compat.sh — pre-deploy gate for canary releases. +# +# Refuses to deploy when the latest migration is NOT backward-compatible +# with the running schema. Backward-compat = the OLD code can still +# read/write against the NEW schema for at least one canary window +# (otherwise canary mode is meaningless ; the old node would crash on +# the first request that touches a removed column). +# +# Heuristic : reject migrations that contain any of these patterns : +# - DROP COLUMN +# - DROP TABLE +# - ALTER COLUMN ... TYPE (type change is rarely backward-compat) +# - ADD COLUMN ... NOT NULL (without DEFAULT — old code can't INSERT) +# - DROP CONSTRAINT +# - DROP INDEX UNIQUE (existing data may already violate) +# +# This is a STATIC check ; some patterns are false-positives (e.g. +# DROP COLUMN of a column that no code reads). When a real migration +# is flagged, the operator either : +# 1. Splits the migration : ship the additive part now, drop in v+1 +# after old-version backends are decommissioned. +# 2. Bypasses with FORCE_MIGRATE=1 + a justification in the commit +# message of the migration file. +# +# v1.0.9 W5 Day 23. 
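+#
+# Illustration of option 1 (hypothetical table / column names, not a
+# migration that exists in this repo) : a rename the gate would flag,
+# split into a backward-compat half now and the destructive half in v+1 :
+#
+#   -- ships with the canary : additive, old code keeps working
+#   ALTER TABLE tracks ADD COLUMN duration_ms BIGINT;
+#   UPDATE tracks SET duration_ms = duration_seconds * 1000;
+#
+#   -- v+1, once no old backend reads duration_seconds any more
+#   ALTER TABLE tracks DROP COLUMN duration_seconds;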
+# +# Usage : +# bash scripts/check-migration-backward-compat.sh +# +# Required env : +# MIGRATIONS_DIR default veza-backend-api/migrations +# GIT_RANGE default origin/main..HEAD ; the range to inspect for +# newly-added migration files +# Optional env : +# FORCE_MIGRATE=1 bypass with a logged warning. Use sparingly. +# +# Exit codes : +# 0 — all new migrations are backward-compat (or FORCE_MIGRATE=1) +# 1 — at least one migration carries a forbidden pattern +# 3 — required tool missing / config error +set -euo pipefail + +MIGRATIONS_DIR=${MIGRATIONS_DIR:-veza-backend-api/migrations} +GIT_RANGE=${GIT_RANGE:-origin/main..HEAD} +FORCE_MIGRATE=${FORCE_MIGRATE:-0} + +log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; } +fail() { log "FAIL: $*"; exit "${2:-1}"; } + +require() { + command -v "$1" >/dev/null 2>&1 || fail "required tool missing: $1" 3 +} + +require git +require grep +require date + +# Patterns that indicate non-backward-compat schema change. +# Heredoc preserves the pipe characters as alternations. +FORBIDDEN_PATTERNS='DROP COLUMN|DROP TABLE|ALTER COLUMN [A-Za-z_]+ TYPE|ADD COLUMN [A-Za-z_]+ [^,;]* NOT NULL[^,;]*(;|$)|DROP CONSTRAINT|DROP INDEX [A-Za-z_]*UNIQUE' + +# Identify newly-added migration files in the current range. +new_migrations=$(git diff --name-only --diff-filter=A "$GIT_RANGE" -- "$MIGRATIONS_DIR" 2>/dev/null \ + | grep -E "^${MIGRATIONS_DIR}/[0-9]+_.*\.sql$" || true) + +if [ -z "$new_migrations" ]; then + log "no new migrations in $GIT_RANGE — nothing to check" + exit 0 +fi + +log "checking $(echo "$new_migrations" | wc -l) new migration(s) in $GIT_RANGE" +findings=0 +for f in $new_migrations; do + log " scanning $f" + # -i case-insensitive ; -E extended regex ; -n line numbers + matches=$(grep -inE "$FORBIDDEN_PATTERNS" "$f" || true) + if [ -n "$matches" ]; then + findings=$((findings + 1)) + log "" + log " ⚠ NON-BACKWARD-COMPAT pattern in $f :" + echo "$matches" | sed 's/^/ /' >&2 + # Special case : ADD COLUMN ... NOT NULL ... DEFAULT is fine. + # The regex above tries to exclude that but the match-then-filter + # approach is more reliable than a single regex. Suppress matches + # that include `DEFAULT` on the same line. + real=$(echo "$matches" | grep -ivE "DEFAULT" || true) + if [ -z "$real" ]; then + log " ↳ all matches include DEFAULT clause — actually backward-compat" + findings=$((findings - 1)) + fi + fi +done + +if [ "$findings" -gt 0 ]; then + log "" + log "$findings migration(s) flagged as potentially non-backward-compat." + if [ "$FORCE_MIGRATE" = "1" ]; then + log "FORCE_MIGRATE=1 set — proceeding anyway." + exit 0 + fi + log "" + log "Options to proceed :" + log " 1. Split the migration : ship the additive part now, drop the" + log " non-compat part in v+1 after old backends are off." + log " 2. Set FORCE_MIGRATE=1 if you accept the risk + document the" + log " justification in the migration's commit message." + exit 1 +fi + +log "PASS : all new migrations are backward-compat" +exit 0 diff --git a/scripts/deploy-canary.sh b/scripts/deploy-canary.sh new file mode 100755 index 000000000..f3d42a1d7 --- /dev/null +++ b/scripts/deploy-canary.sh @@ -0,0 +1,287 @@ +#!/usr/bin/env bash +# deploy-canary.sh — canary release for the active/active backend-api fleet. +# +# Walks the standard canary recipe (drain → deploy → health → re-enable +# → SLI monitor → repeat or rollback) end-to-end. Designed to run on +# the host that owns the backend-api Incus containers + the haproxy +# admin socket. +# +# v1.0.9 W5 Day 23. 
+# +# Usage : +# bash scripts/deploy-canary.sh /path/to/new/veza-api +# +# Required tools : incus, curl, socat (HAProxy admin socket), bash 4+. +# +# Required env : +# ARTIFACT path to the new veza-api binary (passed as $1 too) +# Optional env : +# POOL_BACKEND HAProxy backend name (default api_pool) +# CANARY_NODE which container to canary first (default backend-api-2) +# PEER_NODES comma-separated list of peers to roll AFTER canary +# succeeds (default backend-api-1) +# HEALTH_HOST host to curl (default haproxy.lxd ; LB-routed) +# HEALTH_PATH default /api/v1/health +# SLI_WINDOW SLI monitor duration in seconds (default 3600 = 1h) +# SLI_PROBE_INTERVAL seconds between SLI probes (default 30) +# PROM_URL Prometheus query URL (default http://prom.lxd:9090) +# PROM_P95_THRESHOLD_S p95 SLI threshold in seconds (default 0.5) +# PROM_ERR_RATE_THRESHOLD error rate threshold (default 0.005 = 0.5%) +# ROLLBACK_BINARY path to the previous-known-good binary (used on red) +# If unset, rollback skips the binary swap and just +# re-enables the canary node — operator handles the +# real revert. +# PRE_DEPLOY_HOOK path to script that validates migrations are +# backward-compat. Defaults to scripts/check-migration-backward-compat.sh +# when present. +# +# Exit codes : +# 0 — canary + full roll succeeded +# 1 — pre-deploy validation failed ; nothing was changed +# 2 — canary failed ; rollback executed +# 3 — required tool / env missing +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)" + +ARTIFACT=${ARTIFACT:-${1:-?}} +POOL_BACKEND=${POOL_BACKEND:-api_pool} +CANARY_NODE=${CANARY_NODE:-backend-api-2} +PEER_NODES=${PEER_NODES:-backend-api-1} +HEALTH_HOST=${HEALTH_HOST:-haproxy.lxd} +HEALTH_PATH=${HEALTH_PATH:-/api/v1/health} +SLI_WINDOW=${SLI_WINDOW:-3600} +SLI_PROBE_INTERVAL=${SLI_PROBE_INTERVAL:-30} +PROM_URL=${PROM_URL:-http://prom.lxd:9090} +PROM_P95_THRESHOLD_S=${PROM_P95_THRESHOLD_S:-0.5} +PROM_ERR_RATE_THRESHOLD=${PROM_ERR_RATE_THRESHOLD:-0.005} +ROLLBACK_BINARY=${ROLLBACK_BINARY:-} +PRE_DEPLOY_HOOK=${PRE_DEPLOY_HOOK:-${REPO_ROOT}/scripts/check-migration-backward-compat.sh} + +log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; } +die() { log "FAIL: $*"; exit "${2:-1}"; } + +require() { + command -v "$1" >/dev/null 2>&1 || die "required tool missing: $1" 3 +} + +require incus +require curl +require socat +require date + +if [ "$ARTIFACT" = "?" ] || [ ! -f "$ARTIFACT" ]; then + die "ARTIFACT (or \$1) must point to an existing binary" 1 +fi + +# -------------------------------------------------------------------- +# Helpers : HAProxy admin socket commands. +# -------------------------------------------------------------------- +HAPROXY_CONTAINER=${HAPROXY_CONTAINER:-haproxy} + +ha_cmd() { + incus exec "$HAPROXY_CONTAINER" -- bash -c "echo '$1' | socat /run/haproxy/admin.sock -" +} + +ha_state() { + local node=$1 + ha_cmd "show servers state $POOL_BACKEND" \ + | awk -v n="$node" '$0 ~ n {print $7}' | head -1 + # field 7 in `show servers state` is operational_state (0=stop, 1=run, 2=ready/drain) +} + +ha_drain() { + log "haproxy : drain $1" + ha_cmd "set server ${POOL_BACKEND}/${1} state drain" >/dev/null +} + +ha_ready() { + log "haproxy : ready $1" + ha_cmd "set server ${POOL_BACKEND}/${1} state ready" >/dev/null +} + +ha_wait_drained() { + # Drain finishes when the server reports 0 active connections. 
+ local node=$1 + local deadline=$(( $(date +%s) + 60 )) + while [ "$(date +%s)" -lt "$deadline" ]; do + local n + n=$(ha_cmd "show stat" | awk -F, -v s="$node" '$2 == s {print $5; exit}' 2>/dev/null || echo 0) + if [ "${n:-0}" = "0" ]; then + log "haproxy : $node drained (0 active connections)" + return 0 + fi + sleep 2 + done + log "WARN : $node still has active connections after 60s drain ; proceeding anyway" +} + +curl_health() { + curl --max-time 5 -sS -o /dev/null -w "%{http_code}" \ + "http://${HEALTH_HOST}${HEALTH_PATH}" 2>/dev/null || echo "000" +} + +# -------------------------------------------------------------------- +# SLI monitor — query Prometheus over the SLI_WINDOW. Fails as soon as +# any probe reports red so we can rollback fast. +# -------------------------------------------------------------------- +prom_query() { + local q=$1 + curl --max-time 10 -sS -G --data-urlencode "query=${q}" \ + "${PROM_URL}/api/v1/query" 2>/dev/null \ + | jq -r '.data.result[0].value[1] // "0"' 2>/dev/null || echo 0 +} + +monitor_sli() { + log "monitoring SLI for ${SLI_WINDOW}s (probes every ${SLI_PROBE_INTERVAL}s)" + local deadline=$(( $(date +%s) + SLI_WINDOW )) + local probes=0 + local first_red="" + while [ "$(date +%s)" -lt "$deadline" ]; do + probes=$((probes + 1)) + local p95 err + p95=$(prom_query 'histogram_quantile(0.95, sum(rate(veza_gin_http_request_duration_seconds_bucket{job="veza-backend"}[5m])) by (le))') + err=$(prom_query 'sum(rate(veza_gin_http_requests_total{job="veza-backend",status=~"5.."}[5m])) / sum(rate(veza_gin_http_requests_total{job="veza-backend"}[5m]))') + log " probe $probes : p95=${p95}s err=${err}" + + # awk used for float comparison ; bash test only does integers. + if awk -v a="$p95" -v b="$PROM_P95_THRESHOLD_S" 'BEGIN{exit !(a > b)}'; then + first_red="p95 ${p95}s > threshold ${PROM_P95_THRESHOLD_S}s" + break + fi + if awk -v a="$err" -v b="$PROM_ERR_RATE_THRESHOLD" 'BEGIN{exit !(a > b)}'; then + first_red="error rate ${err} > threshold ${PROM_ERR_RATE_THRESHOLD}" + break + fi + sleep "$SLI_PROBE_INTERVAL" + done + if [ -n "$first_red" ]; then + log "SLI red after $probes probe(s) : $first_red" + return 1 + fi + log "SLI green for the full ${SLI_WINDOW}s window ($probes probes)" + return 0 +} + +# -------------------------------------------------------------------- +# Deploy + rollback primitives. +# -------------------------------------------------------------------- +deploy_to() { + local node=$1 + local artifact=$2 + log "deploying $artifact → $node" + incus file push "$artifact" "$node/opt/veza/backend-api/veza-api" \ + --uid 1001 --gid 1001 --mode 0755 + incus exec "$node" -- systemctl restart veza-backend-api +} + +verify_node_health() { + local node=$1 + log "node health check : $node" + local deadline=$(( $(date +%s) + 60 )) + while [ "$(date +%s)" -lt "$deadline" ]; do + if incus exec "$node" -- curl --max-time 3 -sSf http://127.0.0.1:8080${HEALTH_PATH} >/dev/null 2>&1; then + log " $node : 200" + return 0 + fi + sleep 2 + done + return 1 +} + +rollback_canary() { + log "ROLLBACK : restoring $CANARY_NODE" + if [ -n "$ROLLBACK_BINARY" ] && [ -f "$ROLLBACK_BINARY" ]; then + deploy_to "$CANARY_NODE" "$ROLLBACK_BINARY" || true + verify_node_health "$CANARY_NODE" || log "rollback : node health check still failing" + else + log "ROLLBACK_BINARY not set — leaving binary in place ; operator must finish revert" + fi + ha_ready "$CANARY_NODE" +} + +# -------------------------------------------------------------------- +# 1. 
Pre-deploy hook (migration backward-compat). +# -------------------------------------------------------------------- +log "step 1 : pre-deploy hook" +if [ -x "$PRE_DEPLOY_HOOK" ]; then + if ! "$PRE_DEPLOY_HOOK"; then + die "pre-deploy hook ($PRE_DEPLOY_HOOK) reported a backward-incompat migration ; aborting" 1 + fi +else + log " PRE_DEPLOY_HOOK ($PRE_DEPLOY_HOOK) not executable ; skipping (no-op)" +fi + +# -------------------------------------------------------------------- +# 2. Drain canary node. +# -------------------------------------------------------------------- +log "step 2 : drain $CANARY_NODE in HAProxy" +ha_drain "$CANARY_NODE" +ha_wait_drained "$CANARY_NODE" + +# -------------------------------------------------------------------- +# 3. Deploy artifact to the canary node. +# -------------------------------------------------------------------- +log "step 3 : deploy artifact to $CANARY_NODE" +deploy_to "$CANARY_NODE" "$ARTIFACT" + +# -------------------------------------------------------------------- +# 4. Per-node health check. +# -------------------------------------------------------------------- +log "step 4 : health check on $CANARY_NODE" +if ! verify_node_health "$CANARY_NODE"; then + log "$CANARY_NODE failed health check post-deploy" + rollback_canary + exit 2 +fi + +# -------------------------------------------------------------------- +# 5. Re-enable + LB health check (proves HAProxy sees the node ready). +# -------------------------------------------------------------------- +log "step 5 : re-enable $CANARY_NODE in HAProxy" +ha_ready "$CANARY_NODE" +sleep 5 +lb_status=$(curl_health) +if [ "$lb_status" != "200" ]; then + log "LB health check after re-enable returned $lb_status ; rolling back" + rollback_canary + exit 2 +fi + +# -------------------------------------------------------------------- +# 6. SLI monitor — kept the canary alive ; if SLO breaches, rollback. +# -------------------------------------------------------------------- +log "step 6 : monitor SLI on the canary" +if ! monitor_sli; then + log "SLI red — rolling back the canary" + rollback_canary + exit 2 +fi + +# -------------------------------------------------------------------- +# 7. SLI green — repeat on each peer. +# -------------------------------------------------------------------- +log "step 7 : SLI green on canary, rolling peers : $PEER_NODES" +IFS=',' read -ra peers <<< "$PEER_NODES" +for peer in "${peers[@]}"; do + log "── peer $peer ───────────────────────────" + ha_drain "$peer" + ha_wait_drained "$peer" + deploy_to "$peer" "$ARTIFACT" + if ! verify_node_health "$peer"; then + log "$peer health check failed post-deploy" + log "WARN : leaving $peer drained ; canary node still serves traffic" + log " operator must re-deploy known-good binary or repair $peer manually" + exit 2 + fi + ha_ready "$peer" + sleep 5 + lb_status=$(curl_health) + if [ "$lb_status" != "200" ]; then + log "LB health check after re-enable of $peer returned $lb_status — abandoning roll" + exit 2 + fi +done + +log "PASS : canary $CANARY_NODE + peers $PEER_NODES deployed cleanly" +exit 0