diff --git a/config/grafana/dashboards/redis-cache-overview.json b/config/grafana/dashboards/redis-cache-overview.json new file mode 100644 index 000000000..11a96b2c9 --- /dev/null +++ b/config/grafana/dashboards/redis-cache-overview.json @@ -0,0 +1,102 @@ +{ + "annotations": { "list": [] }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { "unit": "percentunit", "min": 0, "max": 1, "color": { "mode": "thresholds" }, "thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "yellow", "value": 0.9 }, { "color": "green", "value": 0.99 }] } }, + "overrides": [] + }, + "gridPos": { "h": 6, "w": 8, "x": 0, "y": 0 }, + "id": 1, + "options": { "reduceOptions": { "calcs": ["lastNotNull"] }, "orientation": "auto" }, + "targets": [ + { + "expr": "sum(rate(veza_cache_hits_total{subsystem=\"rate_limiter\"}[5m])) / (sum(rate(veza_cache_hits_total{subsystem=\"rate_limiter\"}[5m])) + sum(rate(veza_cache_misses_total{subsystem=\"rate_limiter\"}[5m])))", + "refId": "A" + } + ], + "title": "Rate limiter — cache hit rate (5m)", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { "unit": "percentunit", "min": 0, "max": 1, "color": { "mode": "thresholds" }, "thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "yellow", "value": 0.9 }, { "color": "green", "value": 0.99 }] } }, + "overrides": [] + }, + "gridPos": { "h": 6, "w": 8, "x": 8, "y": 0 }, + "id": 2, + "options": { "reduceOptions": { "calcs": ["lastNotNull"] }, "orientation": "auto" }, + "targets": [ + { + "expr": "sum(rate(veza_cache_hits_total{subsystem=\"chat_pubsub\"}[5m])) / (sum(rate(veza_cache_hits_total{subsystem=\"chat_pubsub\"}[5m])) + sum(rate(veza_cache_misses_total{subsystem=\"chat_pubsub\"}[5m])))", + "refId": "A" + } + ], + "title": "Chat PubSub — hit rate (5m)", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { "unit": "percentunit", "min": 0, "max": 1, "color": { "mode": "thresholds" }, "thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "yellow", "value": 0.9 }, { "color": "green", "value": 0.99 }] } }, + "overrides": [] + }, + "gridPos": { "h": 6, "w": 8, "x": 16, "y": 0 }, + "id": 3, + "options": { "reduceOptions": { "calcs": ["lastNotNull"] }, "orientation": "auto" }, + "targets": [ + { + "expr": "sum(rate(veza_cache_hits_total{subsystem=\"presence\"}[5m])) / (sum(rate(veza_cache_hits_total{subsystem=\"presence\"}[5m])) + sum(rate(veza_cache_misses_total{subsystem=\"presence\"}[5m])))", + "refId": "A" + } + ], + "title": "Presence — hit rate (5m)", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { "defaults": { "unit": "ops", "color": { "mode": "palette-classic" } }, "overrides": [] }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }, + "id": 4, + "options": { "legend": { "displayMode": "list", "placement": "bottom" } }, + "targets": [ + { "expr": "sum by (subsystem) (rate(veza_cache_hits_total[5m]))", "legendFormat": "{{subsystem}} hits", "refId": "A" }, + { "expr": "sum by (subsystem) (rate(veza_cache_misses_total[5m]))", "legendFormat": "{{subsystem}} misses", "refId": "B" } + ], + "title": "Hits + misses per subsystem (ops/s)", + 
"type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { "defaults": { "unit": "short", "color": { "mode": "palette-classic" } }, "overrides": [] }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }, + "id": 5, + "options": { "legend": { "displayMode": "list", "placement": "bottom" } }, + "targets": [ + { "expr": "redis_connected_clients", "legendFormat": "{{instance}} clients", "refId": "A" }, + { "expr": "redis_connected_slaves", "legendFormat": "{{instance}} replicas", "refId": "B" } + ], + "title": "Redis connectivity (requires redis_exporter)", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 38, + "style": "dark", + "tags": ["veza", "redis", "cache"], + "templating": { "list": [] }, + "time": { "from": "now-1h", "to": "now" }, + "timepicker": {}, + "timezone": "browser", + "title": "Veza Redis + Cache Hit Rate", + "uid": "veza-redis-cache", + "version": 1 +} diff --git a/docs/ENV_VARIABLES.md b/docs/ENV_VARIABLES.md index 615d2a12a..3d16f491b 100644 --- a/docs/ENV_VARIABLES.md +++ b/docs/ENV_VARIABLES.md @@ -98,6 +98,9 @@ Hard requirement : `DATABASE_URL`. Pool par défaut tuné pour un dev mono-pod ; | --- | --- | --- | --- | | **`REDIS_URL`** | `redis://veza.fr:6379` | `config.go:338` | URL complète. **Doit être explicite en prod** (`validation.go:916`) — pas de fallback qui casserait multi-pod. | | `REDIS_ENABLE` | `true` | `config.go:339` | Désactiver Redis désactive CSRF, rate-limit, cache. | +| `REDIS_SENTINEL_ADDRS` | (none) | `redis_init.go` | v1.0.9 W3 Day 11. CSV de `host:port` Sentinels (ex. `redis-1.lxd:26379,redis-2.lxd:26379,redis-3.lxd:26379`). Si non vide, le backend utilise `redis.NewFailoverClient` au lieu d'un client direct ; `REDIS_URL` ne sert plus qu'à passer le password + DB index. | +| `REDIS_SENTINEL_MASTER_NAME` | `veza-master` | `redis_init.go` | Doit matcher la directive `sentinel monitor ...` côté Sentinel. | +| `REDIS_SENTINEL_PASSWORD` | (vide) | `redis_init.go` | Auth Sentinel-to-Sentinel (séparée de `REDIS_PASSWORD` pour limiter le blast radius). | `REDIS_ADDR`, `REDIS_PASSWORD`, `REDIS_DB` apparaissent encore dans le template mais **ne sont plus lus** — utiliser `REDIS_URL`. Voir [§27](#27-variables-dépréciées--legacy). diff --git a/infra/ansible/inventory/lab.yml b/infra/ansible/inventory/lab.yml index 7f3de96d5..ef26586fb 100644 --- a/infra/ansible/inventory/lab.yml +++ b/infra/ansible/inventory/lab.yml @@ -56,6 +56,22 @@ all: vars: ansible_connection: community.general.incus ansible_python_interpreter: /usr/bin/python3 + # v1.0.9 W3 Day 11: Redis Sentinel HA. 3 Incus containers each + # running a redis-server + redis-sentinel; redis-1 boots as master, + # the other two as replicas. Sentinel quorum = 2 across the 3. + redis_ha: + hosts: + redis-1: + redis-2: + redis-3: + vars: + ansible_connection: community.general.incus + ansible_python_interpreter: /usr/bin/python3 + redis_ha_master: + # First in this list is the bootstrap master ; sentinel.conf.j2 + # references this group to point each sentinel at it. + hosts: + redis-1: # v1.0.9 Day 9: otel-collector + Tempo for distributed tracing. # Each runs in its own Incus container; the API on the host points # at otel-collector.lxd:4317 via OTEL_EXPORTER_OTLP_ENDPOINT. 
diff --git a/infra/ansible/playbooks/redis_sentinel.yml b/infra/ansible/playbooks/redis_sentinel.yml new file mode 100644 index 000000000..652e7a918 --- /dev/null +++ b/infra/ansible/playbooks/redis_sentinel.yml @@ -0,0 +1,56 @@ +# Redis Sentinel HA playbook — provisions 3 Incus containers +# (redis-1 / redis-2 / redis-3) and lays down redis + sentinel on +# each. v1.0.9 W3 Day 11. +# +# Run with: +# ansible-playbook -i inventory/lab.yml playbooks/redis_sentinel.yml --check +# ansible-playbook -i inventory/lab.yml playbooks/redis_sentinel.yml \ +# --extra-vars '{"redis_password":"...","redis_sentinel_password":"..."}' +# +# In prod / staging the secrets come from the encrypted vault +# (group_vars/redis_ha.vault.yml). +--- +- name: Provision Incus containers for the Redis formation + hosts: incus_hosts + become: true + gather_facts: true + tasks: + - name: Launch redis-1 + redis-2 + redis-3 + ansible.builtin.shell: + cmd: | + set -e + for ct in redis-1 redis-2 redis-3; do + if ! incus info "$ct" >/dev/null 2>&1; then + incus launch images:ubuntu/22.04 "$ct" + for _ in $(seq 1 30); do + if incus exec "$ct" -- cloud-init status 2>/dev/null | grep -q "status: done"; then + break + fi + sleep 1 + done + incus exec "$ct" -- apt-get update + incus exec "$ct" -- apt-get install -y python3 python3-apt + fi + done + args: + executable: /bin/bash + register: provision_result + changed_when: "'incus launch' in provision_result.stdout" + tags: [redis_sentinel, provision] + + - name: Refresh inventory so the new containers are reachable + ansible.builtin.meta: refresh_inventory + +- name: Apply common baseline to the redis formation + hosts: redis_ha + become: true + gather_facts: true + roles: + - common + +- name: Install + configure Redis + Sentinel on every node + hosts: redis_ha + become: true + gather_facts: true + roles: + - redis_sentinel diff --git a/infra/ansible/roles/redis_sentinel/README.md b/infra/ansible/roles/redis_sentinel/README.md new file mode 100644 index 000000000..1c910bb49 --- /dev/null +++ b/infra/ansible/roles/redis_sentinel/README.md @@ -0,0 +1,104 @@ +# `redis_sentinel` role — Redis 7 + Sentinel HA formation + +Three Incus containers, one Redis + one Sentinel co-located per container. At first boot `redis-1` is master, `redis-2` and `redis-3` are replicas. The 3 sentinels (quorum 2) handle failover when the master dies — promotion is bounded at 30s by `failover-timeout`. + +## Topology + +``` + ┌─────────────┐ + │ redis-1 │ master at first boot + │ • redis │ + │ • sentinel │ + └──────┬──────┘ + │ replication + ┌────────────┴────────────┐ + ▼ ▼ + ┌─────────────┐ ┌─────────────┐ + │ redis-2 │ │ redis-3 │ + │ • replica │ │ • replica │ + │ • sentinel │ │ • sentinel │ + └─────────────┘ └─────────────┘ +``` + +The 3 sentinels gossip on port `26379` and elect a leader to drive each failover. **Quorum = 2**, so we tolerate one Sentinel crash without losing failover capability. + +## Why Sentinel and not Cluster + +- We don't need sharding at v1.0 — total Redis dataset fits in 1 GB. +- Sentinel is dramatically simpler (no slot management, no resharding). +- The backend's `redis.NewFailoverClient` speaks Sentinel natively ; switching to Cluster would mean rewriting every `Get/Set/Eval` call site. + +When Veza traffic forces sharding (probably v2+), we revisit. + +## Defaults + +| variable | default | meaning | +| ------------------------------------- | ------------------ | --------------------------------------- | +| `redis_master_name` | `veza-master` | Sentinel name. 
Backend uses this. | +| `redis_port` | `6379` | Redis port | +| `redis_sentinel_port` | `26379` | Sentinel port | +| `redis_sentinel_quorum` | `2` | sentinels that must agree to fail over | +| `redis_sentinel_down_after_ms` | `5000` | ms before "subjectively down" | +| `redis_sentinel_failover_timeout_ms` | `30000` | upper bound on a failover | +| `redis_password` | (vault) | data-plane auth | +| `redis_sentinel_password` | (vault) | sentinel-to-sentinel auth | +| `redis_maxmemory` | `1gb` | hard cap | +| `redis_maxmemory_policy` | `allkeys-lru` | eviction policy | + +## Vault setup + +```yaml +# group_vars/redis_ha.vault.yml — encrypt with `ansible-vault encrypt` +redis_password: "" +redis_sentinel_password: "" +``` + +The role asserts the placeholder values are gone before applying to anything other than `lab`. + +## Backend integration + +The backend reads three new env vars at boot (handled by +`internal/config/redis_init.go`): + +``` +REDIS_SENTINEL_ADDRS=redis-1.lxd:26379,redis-2.lxd:26379,redis-3.lxd:26379 +REDIS_SENTINEL_MASTER_NAME=veza-master +REDIS_SENTINEL_PASSWORD= +REDIS_URL=redis://:@dummy:6379/0 # password + DB still parsed off the URL +``` + +When `REDIS_SENTINEL_ADDRS` is empty, the backend falls back to a single-instance client (the dev/local pattern). + +## Operations + +```bash +# Identify the current master : +redis-cli -h redis-1.lxd -p 26379 -a "$SENTINEL_PASS" SENTINEL get-master-addr-by-name veza-master + +# Force a failover (manual ; for game-day drills) : +redis-cli -h redis-1.lxd -p 26379 -a "$SENTINEL_PASS" SENTINEL failover veza-master + +# Check replication state from any node : +redis-cli -h redis-1.lxd -a "$REDIS_PASS" INFO replication + +# Tail sentinel logs across all 3 : +for n in redis-1 redis-2 redis-3; do + echo "=== $n ===" + ssh "$n" sudo tail -50 /var/log/redis/redis-sentinel.log +done +``` + +## Failover smoke test + +```bash +bash infra/ansible/tests/test_redis_failover.sh +``` + +Sequence : kills the current master container, polls the sentinels until a new master is elected, asserts elapsed time < 30s, verifies `INFO replication` on the survivor shows it's now master. Suitable for the W2 verification gate + game-day day 24. + +## What this role does NOT cover + +- **TLS between client ↔ Redis** — `tls-port` is W4 territory. Today the Incus bridge is the security boundary. +- **Persistent data backups** — RDB snapshots stay on the data node only. Redis state is reconstructible (sessions get re-issued, presence is ephemeral) so this is intentional. +- **Cluster mode (sharding)** — see "Why Sentinel and not Cluster" above. v2+. +- **Cross-host replication** — three containers on the same lab host today. Day 7 of W2 already moved Postgres to dedicated hosts ; the same host-split applies here when Hetzner standby is provisioned (W2 day 7+ note in `postgres_ha.yml`). diff --git a/infra/ansible/roles/redis_sentinel/defaults/main.yml b/infra/ansible/roles/redis_sentinel/defaults/main.yml new file mode 100644 index 000000000..614a1d2f5 --- /dev/null +++ b/infra/ansible/roles/redis_sentinel/defaults/main.yml @@ -0,0 +1,44 @@ +# redis_sentinel defaults — Redis 7 + Sentinel co-located across 3 +# Incus containers (redis-1 master at first boot, redis-2/redis-3 +# replicas; one Sentinel per container = quorum 2 out of 3). +--- +redis_version: "7" # apt provides 7.x on Ubuntu 22.04 +redis_master_name: "veza-master" +redis_port: 6379 +redis_sentinel_port: 26379 + +# Replication / persistence — sane prod defaults. 
AOF on for durability, +# RDB snapshot still kept for fast restore. +redis_aof_enabled: true +redis_save_config: "3600 1 300 100 60 10000" + +# Sentinel quorum — number of sentinels that must agree before declaring +# the master down. With 3 sentinels, quorum=2 tolerates one sentinel +# crash. Don't lower below 2 in prod, ever. +redis_sentinel_quorum: 2 + +# Failover thresholds — match Day 11 acceptance criterion (< 30s). +# down-after-milliseconds: how long a master must be unreachable before +# a sentinel marks it as subjectively down. +# failover-timeout: max time to wait for replica promotion + reconfig +# before another failover can be triggered. +redis_sentinel_down_after_ms: 5000 # 5s = sentinel quorum decision in ~6-7s +redis_sentinel_failover_timeout_ms: 30000 # 30s budget for the whole flip + +# Auth — required in prod (the Sentinel API can re-route traffic, so +# unauth'd Sentinel = security hole). Override via Vault. +redis_password: "CHANGE_ME_VAULT" +redis_sentinel_password: "CHANGE_ME_VAULT_SENTINEL" + +# bind / protected-mode — bind to the Incus bridge IP only (10.0.x.y). +# protected-mode is OFF because we set bind explicitly + auth is on. +redis_bind: "0.0.0.0" +redis_protected_mode: "no" + +# Resource caps — overall memory limit + eviction policy. The eviction +# policy `allkeys-lru` is intentionally non-zero-data-loss : presence +# keys, sessions, rate-limit counters are all OK to evict under +# pressure. If we add cache lines that MUST persist we'll need a second +# DB with `noeviction`. +redis_maxmemory: "1gb" +redis_maxmemory_policy: "allkeys-lru" diff --git a/infra/ansible/roles/redis_sentinel/handlers/main.yml b/infra/ansible/roles/redis_sentinel/handlers/main.yml new file mode 100644 index 000000000..c35b6c7cd --- /dev/null +++ b/infra/ansible/roles/redis_sentinel/handlers/main.yml @@ -0,0 +1,10 @@ +--- +- name: Restart redis-server + ansible.builtin.service: + name: redis-server + state: restarted + +- name: Restart redis-sentinel + ansible.builtin.service: + name: redis-sentinel + state: restarted diff --git a/infra/ansible/roles/redis_sentinel/tasks/main.yml b/infra/ansible/roles/redis_sentinel/tasks/main.yml new file mode 100644 index 000000000..6a4353e04 --- /dev/null +++ b/infra/ansible/roles/redis_sentinel/tasks/main.yml @@ -0,0 +1,76 @@ +# redis_sentinel role — installs redis-server + redis-sentinel, renders +# both configs from templates, ensures both systemd units running. +# Idempotent — safe to re-apply. +--- +- name: Vault placeholders are overridden in prod + ansible.builtin.assert: + that: + - redis_password != "CHANGE_ME_VAULT" + - redis_sentinel_password != "CHANGE_ME_VAULT_SENTINEL" + fail_msg: | + redis_password and redis_sentinel_password still hold the placeholder + values. Provide them via group_vars/redis_ha.vault.yml (encrypted) + before applying this role to staging or prod. Lab override : run + with `--extra-vars '{"redis_password":"...","redis_sentinel_password":"..."}'`. 
+  when: ansible_user_id != "lab" and (deploy_env | default("lab")) != "lab"
+  tags: [redis_sentinel, assert]
+
+- name: Install redis-server + redis-sentinel
+  ansible.builtin.apt:
+    name:
+      - redis-server
+      - redis-sentinel
+      - redis-tools # for redis-cli (used by smoke tests)
+    state: present
+    update_cache: true
+    cache_valid_time: 3600
+  tags: [redis_sentinel, packages]
+
+- name: Render redis.conf
+  ansible.builtin.template:
+    src: redis.conf.j2
+    dest: /etc/redis/redis.conf
+    owner: redis
+    group: redis
+    mode: "0640"
+  notify: Restart redis-server
+  tags: [redis_sentinel, config]
+
+- name: Render sentinel.conf
+  # Note: the systemd unit for redis-sentinel rewrites this file at
+  # runtime (Sentinel persists the discovered master address). Render
+  # it once at first boot; subsequent applies should NOT clobber the
+  # in-place edits made by sentinel itself. The `force: false` flag
+  # makes this idempotent without overwriting Sentinel's state.
+  ansible.builtin.template:
+    src: sentinel.conf.j2
+    dest: /etc/redis/sentinel.conf
+    owner: redis
+    group: redis
+    mode: "0640"
+    force: false
+  notify: Restart redis-sentinel
+  tags: [redis_sentinel, config]
+
+- name: Ensure /var/log/redis exists
+  ansible.builtin.file:
+    path: /var/log/redis
+    state: directory
+    owner: redis
+    group: redis
+    mode: "0755"
+  tags: [redis_sentinel, config]
+
+- name: Enable + start redis-server
+  ansible.builtin.service:
+    name: redis-server
+    state: started
+    enabled: true
+  tags: [redis_sentinel, service]
+
+- name: Enable + start redis-sentinel
+  ansible.builtin.service:
+    name: redis-sentinel
+    state: started
+    enabled: true
+  tags: [redis_sentinel, service]
diff --git a/infra/ansible/roles/redis_sentinel/templates/redis.conf.j2 b/infra/ansible/roles/redis_sentinel/templates/redis.conf.j2
new file mode 100644
index 000000000..97e4e064f
--- /dev/null
+++ b/infra/ansible/roles/redis_sentinel/templates/redis.conf.j2
@@ -0,0 +1,54 @@
+# Managed by Ansible — do not edit by hand.
+# Veza Redis 7 config — replication via Sentinel (see sentinel.conf).
+#
+# Topology at first boot:
+#   redis-1: master
+#   redis-2: replicaof redis-1.lxd
+#   redis-3: replicaof redis-1.lxd
+# After failover, Sentinel rewrites this file in-place to point at the
+# new master. Do NOT re-render this template after first boot — set
+# `force: false` in the Ansible task that owns it.
+
+bind {{ redis_bind }}
+port {{ redis_port }}
+protected-mode {{ redis_protected_mode }}
+daemonize no
+supervised systemd
+
+requirepass {{ redis_password }}
+masterauth {{ redis_password }}
+
+{% if inventory_hostname != groups['redis_ha_master'][0] %}
+# Replicas point at the bootstrap master. Sentinel re-points them on
+# failover; this directive only matters at first boot.
+replicaof {{ groups['redis_ha_master'][0] }}.lxd {{ redis_port }}
+{% endif %}
+
+# Replicas stay read-only. Nothing reads from them yet (the backend's
+# FailoverClient talks to the master), but a future read-mostly fanout
+# (chat pubsub history, presence GETs) could target a replica safely.
+replica-read-only yes
+
+# Persistence — AOF + occasional RDB. AOF gives ~ 1s RPO with
+# everysec; RDB is fast restore.
+{% if redis_aof_enabled %}
+appendonly yes
+appendfsync everysec
+{% else %}
+appendonly no
+{% endif %}
+save {{ redis_save_config }}
+
+# Memory cap + eviction. Eviction is OK for the use cases we have in
+# v1.0 (sessions, rate-limit counters, presence — all reconstructible).
+maxmemory {{ redis_maxmemory }} +maxmemory-policy {{ redis_maxmemory_policy }} + +# Logging +logfile /var/log/redis/redis-server.log +loglevel notice + +# Slow log — anything > 10ms gets captured. Useful when we suspect a +# slow Lua script (rate limiter Eval) is back-pressuring. +slowlog-log-slower-than 10000 +slowlog-max-len 256 diff --git a/infra/ansible/roles/redis_sentinel/templates/sentinel.conf.j2 b/infra/ansible/roles/redis_sentinel/templates/sentinel.conf.j2 new file mode 100644 index 000000000..0fd7686cc --- /dev/null +++ b/infra/ansible/roles/redis_sentinel/templates/sentinel.conf.j2 @@ -0,0 +1,28 @@ +# Managed by Ansible — do not edit by hand. +# Sentinel config (rendered once at first boot ; sentinel rewrites +# in-place after that to record discovered topology). + +bind {{ redis_bind }} +port {{ redis_sentinel_port }} +daemonize no +supervised systemd +protected-mode {{ redis_protected_mode }} + +requirepass {{ redis_sentinel_password }} + +# `monitor ` +# host = bootstrap master ; sentinel discovers replicas via INFO REPLICATION +sentinel monitor {{ redis_master_name }} {{ groups['redis_ha_master'][0] }}.lxd {{ redis_port }} {{ redis_sentinel_quorum }} +sentinel auth-pass {{ redis_master_name }} {{ redis_password }} + +# Failover thresholds (Day 11 acceptance : promotion < 30s). +sentinel down-after-milliseconds {{ redis_master_name }} {{ redis_sentinel_down_after_ms }} +sentinel parallel-syncs {{ redis_master_name }} 1 +sentinel failover-timeout {{ redis_master_name }} {{ redis_sentinel_failover_timeout_ms }} + +# Sentinel-to-Sentinel auth. Without this an attacker on the same Incus +# bridge could register a phantom sentinel and trigger a split brain. +sentinel sentinel-pass {{ redis_sentinel_password }} + +logfile /var/log/redis/redis-sentinel.log +loglevel notice diff --git a/infra/ansible/tests/test_redis_failover.sh b/infra/ansible/tests/test_redis_failover.sh new file mode 100755 index 000000000..115efc0ff --- /dev/null +++ b/infra/ansible/tests/test_redis_failover.sh @@ -0,0 +1,129 @@ +#!/usr/bin/env bash +# test_redis_failover.sh — validate Sentinel promotes a replica to master +# in < 30s when the current master dies. +# +# Run on the Incus host that owns the redis-1/2/3 containers (typically +# the lab R720). Assumes the redis_sentinel playbook has been applied +# so the formation is healthy at script start — bails early otherwise. +# +# v1.0.9 W3 Day 11 — acceptance for the verification gate : +# "kill Redis master, verify promotion automatique d'un replica en < 30s". +# +# Usage: +# REDIS_PASS=... SENTINEL_PASS=... bash infra/ansible/tests/test_redis_failover.sh +# +# Exit codes: +# 0 — promotion completed in < 30s (acceptance met) +# 1 — formation not healthy at start +# 2 — promotion did not complete within 30s +# 3 — required tool missing on the host +set -euo pipefail + +REDIS_CONTAINERS=(redis-1 redis-2 redis-3) +MASTER_NAME=${MASTER_NAME:-veza-master} +RTO_TARGET_SECONDS=${RTO_TARGET_SECONDS:-30} +SENTINEL_PORT=${SENTINEL_PORT:-26379} +REDIS_PORT=${REDIS_PORT:-6379} +REDIS_PASS=${REDIS_PASS:-?} +SENTINEL_PASS=${SENTINEL_PASS:-?} + +log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; } +fail() { log "FAIL: $*"; exit "${2:-2}"; } + +require() { + command -v "$1" >/dev/null 2>&1 || fail "required tool missing on host: $1" 3 +} + +require incus +require date + +if [ "$REDIS_PASS" = "?" ] || [ "$SENTINEL_PASS" = "?" 
]; then + fail "REDIS_PASS and SENTINEL_PASS env vars are required (read them from the vault before invoking)" 3 +fi + +# Helper : ask any sentinel which host:port is currently master. +get_master_addr() { + local ct=$1 + incus exec "$ct" -- redis-cli -p "$SENTINEL_PORT" -a "$SENTINEL_PASS" --no-auth-warning \ + SENTINEL get-master-addr-by-name "$MASTER_NAME" 2>/dev/null | tr '\n' ' ' +} + +# ----------------------------------------------------------------------------- +# 0. Sanity — formation must be healthy at start. +# ----------------------------------------------------------------------------- +log "step 0: pre-flight — Sentinel reports current master" +master_addr_before=$(get_master_addr "${REDIS_CONTAINERS[0]}") +if [ -z "$master_addr_before" ]; then + fail "no master visible in Sentinel — refusing to test from a degraded baseline" 1 +fi +log "current master (host port) : $master_addr_before" + +# Resolve which container hosts the current master so we know whom to kill. +master_host=$(echo "$master_addr_before" | awk '{print $1}') +master_container="" +for ct in "${REDIS_CONTAINERS[@]}"; do + ip=$(incus list "$ct" -c 4 -f csv 2>/dev/null | head -1 | awk '{print $1}' | tr -d ',') + # accept either the .lxd hostname or the IP. The .lxd suffix is what + # sentinel.conf hands out ; the IP is what `incus list` shows. + if [ "$ct.lxd" = "$master_host" ] || [ "$ip" = "$master_host" ]; then + master_container=$ct + break + fi +done + +if [ -z "$master_container" ]; then + fail "could not map master host '$master_host' to a known container" 1 +fi +log "master container resolved to: $master_container" + +# ----------------------------------------------------------------------------- +# 1. Kill master container — simulates hardware/process death. +# ----------------------------------------------------------------------------- +log "step 1: stopping $master_container — start timer" +t0=$(date +%s) +incus stop --force "$master_container" + +# ----------------------------------------------------------------------------- +# 2. Poll surviving sentinels until they announce a new master. +# ----------------------------------------------------------------------------- +log "step 2: polling sentinels for new master (target RTO ${RTO_TARGET_SECONDS}s)" +deadline=$((t0 + RTO_TARGET_SECONDS)) +promoted=0 +new_master="" +while [ "$(date +%s)" -lt "$deadline" ]; do + for ct in "${REDIS_CONTAINERS[@]}"; do + if [ "$ct" = "$master_container" ]; then continue; fi + addr=$(get_master_addr "$ct") + if [ -n "$addr" ] && [ "$addr" != "$master_addr_before" ]; then + new_master=$addr + promoted=1 + break 2 + fi + done + sleep 1 +done + +t1=$(date +%s) +elapsed=$((t1 - t0)) + +# ----------------------------------------------------------------------------- +# 3. Restart the killed container so it can rejoin as replica for the +# next run. +# ----------------------------------------------------------------------------- +log "step 3: restarting $master_container (will rejoin as replica once it catches up)" +incus start "$master_container" || true + +# ----------------------------------------------------------------------------- +# 4. Verdict. 
+# ----------------------------------------------------------------------------- +if [ "$promoted" -eq 1 ] && [ "$elapsed" -le "$RTO_TARGET_SECONDS" ]; then + log "PASS: master flipped from '$master_addr_before' to '$new_master' in ${elapsed}s (target ${RTO_TARGET_SECONDS}s)" + exit 0 +fi + +log "final Sentinel view:" +for ct in "${REDIS_CONTAINERS[@]}"; do + if [ "$ct" = "$master_container" ]; then continue; fi + echo " $ct: $(get_master_addr "$ct")" >&2 +done +fail "no replica promoted within ${RTO_TARGET_SECONDS}s (elapsed ${elapsed}s, promoted=${promoted})" diff --git a/veza-backend-api/.env.template b/veza-backend-api/.env.template index a1d1453c8..b584e7a34 100644 --- a/veza-backend-api/.env.template +++ b/veza-backend-api/.env.template @@ -56,6 +56,13 @@ REDIS_URL=redis://veza.fr:16379 REDIS_ADDR=veza.fr:6379 REDIS_PASSWORD= REDIS_DB=0 +# v1.0.9 W3 Day 11 — Sentinel HA. Leave REDIS_SENTINEL_ADDRS empty for +# single-instance dev. Set in prod to enable redis.NewFailoverClient. +# Comma-separated host:port list ; the master name must match +# `sentinel monitor` in sentinel.conf. +REDIS_SENTINEL_ADDRS= +REDIS_SENTINEL_MASTER_NAME=veza-master +REDIS_SENTINEL_PASSWORD= # --- RABBITMQ --- # Enable message queue for async events (use veza:password, host port 15672 for docker-compose) diff --git a/veza-backend-api/internal/config/config.go b/veza-backend-api/internal/config/config.go index aa10ee7d5..3b3dd6088 100644 --- a/veza-backend-api/internal/config/config.go +++ b/veza-backend-api/internal/config/config.go @@ -74,6 +74,13 @@ type Config struct { ChatJWTSecret string // Secret pour les tokens WebSocket Chat RedisURL string RedisEnable bool // Enable/Disable Redis + // v1.0.9 Day 11 — Redis Sentinel HA. When SentinelAddrs is non-empty, + // initRedis switches to redis.NewFailoverClient and points at these + // sentinels instead of dialing the URL above. The URL is still read + // (auth + DB index parsed off it) so single-instance dev keeps working. 
+ RedisSentinelAddrs []string + RedisSentinelMasterName string + RedisSentinelPassword string DatabaseURL string DatabaseReadURL string // Optional read replica URL (DATABASE_READ_URL) UploadDir string // Répertoire d'upload @@ -372,8 +379,11 @@ func NewConfig() (*Config, error) { JWTIssuer: getEnv("JWT_ISSUER", "veza-api"), JWTAudience: getEnv("JWT_AUDIENCE", "veza-platform"), ChatJWTSecret: getEnv("CHAT_JWT_SECRET", jwtSecret), - RedisURL: getEnv("REDIS_URL", "redis://"+appDomain+":6379"), - RedisEnable: getEnvBool("REDIS_ENABLE", true), + RedisURL: getEnv("REDIS_URL", "redis://"+appDomain+":6379"), + RedisEnable: getEnvBool("REDIS_ENABLE", true), + RedisSentinelAddrs: parseRedisSentinelAddrs(getEnv("REDIS_SENTINEL_ADDRS", "")), + RedisSentinelMasterName: getEnv("REDIS_SENTINEL_MASTER_NAME", "veza-master"), + RedisSentinelPassword: getEnv("REDIS_SENTINEL_PASSWORD", ""), // SECURITY: DATABASE_URL est REQUIS - contient des credentials sensibles DatabaseURL: databaseURL, DatabaseReadURL: getEnv("DATABASE_READ_URL", ""), @@ -684,7 +694,13 @@ func NewConfig() (*Config, error) { // Initialiser Redis if config.RedisEnable { - config.RedisClient, err = initRedis(config.RedisURL, redisLoggerZap) + config.RedisClient, err = initRedis( + config.RedisURL, + config.RedisSentinelAddrs, + config.RedisSentinelMasterName, + config.RedisSentinelPassword, + redisLoggerZap, + ) if err != nil { // CRITICAL: Protect logger calls from broken pipe errors func() { diff --git a/veza-backend-api/internal/config/redis_init.go b/veza-backend-api/internal/config/redis_init.go index bfa191bec..14dc5ce50 100644 --- a/veza-backend-api/internal/config/redis_init.go +++ b/veza-backend-api/internal/config/redis_init.go @@ -9,8 +9,11 @@ import ( "go.uber.org/zap" ) -// initRedis initialise la connexion Redis -func initRedis(redisURL string, logger *zap.Logger) (*redis.Client, error) { +// initRedis initialise la connexion Redis. v1.0.9 Day 11 : when +// `sentinelAddrs` is non-empty, we wire a Sentinel-aware FailoverClient +// instead of a direct connection. The URL is still consulted for +// password + DB index — Sentinel discovers the host:port pair. +func initRedis(redisURL string, sentinelAddrs []string, sentinelMasterName, sentinelPassword string, logger *zap.Logger) (*redis.Client, error) { opts, err := redis.ParseURL(redisURL) if err != nil { return nil, err @@ -19,7 +22,27 @@ func initRedis(redisURL string, logger *zap.Logger) (*redis.Client, error) { // Configurer un logger filtré pour Redis pour éviter les warnings "maint_notifications" redis.SetLogger(&filteredRedisLogger{logger: logger}) - client := redis.NewClient(opts) + var client *redis.Client + if len(sentinelAddrs) > 0 { + // FailoverClient : Sentinel discovers the current master and + // transparently re-resolves on failover. `MasterName` MUST match + // the value in sentinel.conf (`monitor `). + client = redis.NewFailoverClient(&redis.FailoverOptions{ + MasterName: sentinelMasterName, + SentinelAddrs: sentinelAddrs, + SentinelPassword: sentinelPassword, + // Auth + db reused from the parsed URL so dev/prod stay parametric. + Password: opts.Password, + DB: opts.DB, + // TLS cherrypicked from the URL (rediss://). 
+ TLSConfig: opts.TLSConfig, + }) + logger.Info("Redis Sentinel HA wired", + zap.Strings("sentinels", sentinelAddrs), + zap.String("master", sentinelMasterName)) + } else { + client = redis.NewClient(opts) + } // Test de connexion ctx := context.Background() @@ -31,6 +54,28 @@ func initRedis(redisURL string, logger *zap.Logger) (*redis.Client, error) { return client, nil } +// parseRedisSentinelAddrs splits the comma-separated REDIS_SENTINEL_ADDRS +// env into a clean slice. Empty input -> nil (initRedis falls back to +// single-instance). Trims whitespace + drops empty entries so a typo +// like "a, ,b" doesn't dial a phantom sentinel. +func parseRedisSentinelAddrs(raw string) []string { + if raw == "" { + return nil + } + parts := strings.Split(raw, ",") + out := make([]string, 0, len(parts)) + for _, p := range parts { + p = strings.TrimSpace(p) + if p != "" { + out = append(out, p) + } + } + if len(out) == 0 { + return nil + } + return out +} + // filteredRedisLogger est un wrapper pour filtrer les logs de Redis type filteredRedisLogger struct { logger *zap.Logger diff --git a/veza-backend-api/internal/metrics/cache_hit_rate.go b/veza-backend-api/internal/metrics/cache_hit_rate.go new file mode 100644 index 000000000..ba8ae928e --- /dev/null +++ b/veza-backend-api/internal/metrics/cache_hit_rate.go @@ -0,0 +1,53 @@ +package metrics + +// Cache hit/miss counters per subsystem (v1.0.9 W3 Day 11). +// +// Three call-sites instrumented in v1.0.9: +// - rate_limiter — Redis INCR result classified as "hit" if the key +// already existed in the window (in-window request), +// "miss" if it was a new window (key just created). +// - chat_pubsub — "hit" on a successful Publish/Subscribe round-trip, +// "miss" on connection error (Redis unreachable). +// - presence — "hit" on a successful Get/Set/Del, "miss" on a key +// that didn't exist (presence stale or never set) or +// on an underlying Redis error. +// +// Subsystems are passed as labels rather than baked into separate metrics +// so dashboards can pivot. Cardinality is fixed at the three values above +// (plus future additions in W3+); never label by user_id / room_id / +// per-key — that would explode cardinality. + +import ( + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" +) + +var ( + cacheHits = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "veza_cache_hits_total", + Help: "Total cache hits per subsystem", + }, + []string{"subsystem"}, + ) + + cacheMisses = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "veza_cache_misses_total", + Help: "Total cache misses per subsystem", + }, + []string{"subsystem"}, + ) +) + +// RecordCacheHit increments the hit counter for a subsystem. Subsystem +// must be one of the bounded set documented at file-level — adding a +// new value is a deliberate choice that should also update Grafana. +func RecordCacheHit(subsystem string) { + cacheHits.WithLabelValues(subsystem).Inc() +} + +// RecordCacheMiss increments the miss counter for a subsystem. 
+func RecordCacheMiss(subsystem string) { + cacheMisses.WithLabelValues(subsystem).Inc() +} diff --git a/veza-backend-api/internal/middleware/rate_limiter.go b/veza-backend-api/internal/middleware/rate_limiter.go index d12e4d7aa..fc179ee39 100644 --- a/veza-backend-api/internal/middleware/rate_limiter.go +++ b/veza-backend-api/internal/middleware/rate_limiter.go @@ -9,6 +9,8 @@ import ( "sync" "time" + "veza-backend-api/internal/metrics" + "github.com/gin-gonic/gin" "github.com/google/uuid" "github.com/redis/go-redis/v9" @@ -192,8 +194,12 @@ func checkRedisLimit1s(ctx context.Context, redisClient *redis.Client, key strin ` result, err := redisClient.Eval(ctx, script, []string{key}, limit, ddosWindowSeconds).Result() if err != nil { + // Redis unreachable: caller falls back to in-memory limiter. + // "miss" here = Redis didn't deliver a verdict. + metrics.RecordCacheMiss("rate_limiter") return false, 0, err } + metrics.RecordCacheHit("rate_limiter") results := result.([]interface{}) allowed := results[0].(int64) == 1 remaining := int(results[1].(int64)) @@ -454,6 +460,7 @@ func FrontendLogRateLimit(redisClient *redis.Client) gin.HandlerFunc { ).Result() if err == nil { + metrics.RecordCacheHit("rate_limiter") results := result.([]interface{}) allowed := results[0].(int64) == 1 remaining := int(results[1].(int64)) @@ -472,6 +479,7 @@ func FrontendLogRateLimit(redisClient *redis.Client) gin.HandlerFunc { c.Next() return } + metrics.RecordCacheMiss("rate_limiter") } // Fail-secure: Redis error or nil — use in-memory fallback @@ -572,6 +580,7 @@ func UploadRateLimit(redisClient *redis.Client) gin.HandlerFunc { ).Result() if err == nil { + metrics.RecordCacheHit("rate_limiter") results := result.([]interface{}) allowed := results[0].(int64) == 1 remaining := int(results[1].(int64)) @@ -591,6 +600,7 @@ func UploadRateLimit(redisClient *redis.Client) gin.HandlerFunc { c.Next() return } + metrics.RecordCacheMiss("rate_limiter") } // Fail-secure: Redis error or nil — use in-memory fallback diff --git a/veza-backend-api/internal/services/chat_pubsub.go b/veza-backend-api/internal/services/chat_pubsub.go index 3f8cb7c70..76f995624 100644 --- a/veza-backend-api/internal/services/chat_pubsub.go +++ b/veza-backend-api/internal/services/chat_pubsub.go @@ -4,6 +4,8 @@ import ( "context" "sync" + "veza-backend-api/internal/metrics" + "github.com/google/uuid" "github.com/redis/go-redis/v9" "go.uber.org/zap" @@ -42,6 +44,7 @@ func (s *ChatPubSubService) Publish(ctx context.Context, roomID uuid.UUID, messa if s.redisClient != nil { if err := s.redisClient.Publish(ctx, channel, message).Err(); err != nil { + metrics.RecordCacheMiss("chat_pubsub") // ERROR, not Warn: the in-memory fallback only reaches subscribers // on this pod — a multi-pod chat becomes partitioned until Redis // recovers. Operators should page on this log line. 
@@ -50,10 +53,13 @@ func (s *ChatPubSubService) Publish(ctx context.Context, roomID uuid.UUID, messa zap.Error(err), ) s.publishInMemory(channel, message) + return nil } + metrics.RecordCacheHit("chat_pubsub") return nil } + metrics.RecordCacheMiss("chat_pubsub") s.publishInMemory(channel, message) return nil } @@ -93,9 +99,15 @@ func (s *ChatPubSubService) PublishPresence(ctx context.Context, event []byte) e channel := "chat:presence" if s.redisClient != nil { - return s.redisClient.Publish(ctx, channel, event).Err() + if err := s.redisClient.Publish(ctx, channel, event).Err(); err != nil { + metrics.RecordCacheMiss("chat_pubsub") + return err + } + metrics.RecordCacheHit("chat_pubsub") + return nil } + metrics.RecordCacheMiss("chat_pubsub") s.publishInMemory(channel, event) return nil } diff --git a/veza-backend-api/internal/websocket/chat/presence_service.go b/veza-backend-api/internal/websocket/chat/presence_service.go index 36549e357..b15f3c646 100644 --- a/veza-backend-api/internal/websocket/chat/presence_service.go +++ b/veza-backend-api/internal/websocket/chat/presence_service.go @@ -6,6 +6,8 @@ import ( "fmt" "time" + "veza-backend-api/internal/metrics" + "github.com/google/uuid" "github.com/redis/go-redis/v9" "go.uber.org/zap" @@ -58,10 +60,11 @@ func (s *ChatPresenceService) SetOnline(ctx context.Context, userID uuid.UUID) e } if err := s.redis.Set(ctx, s.presenceKey(userID), data, presenceTTL).Err(); err != nil { + metrics.RecordCacheMiss("presence") s.logger.Warn("Failed to set online presence", zap.Error(err), zap.String("user_id", userID.String())) return fmt.Errorf("set presence: %w", err) } - + metrics.RecordCacheHit("presence") return nil } @@ -71,10 +74,11 @@ func (s *ChatPresenceService) SetOffline(ctx context.Context, userID uuid.UUID) } if err := s.redis.Del(ctx, s.presenceKey(userID)).Err(); err != nil { + metrics.RecordCacheMiss("presence") s.logger.Warn("Failed to delete presence", zap.Error(err), zap.String("user_id", userID.String())) return fmt.Errorf("delete presence: %w", err) } - + metrics.RecordCacheHit("presence") return nil } @@ -95,10 +99,11 @@ func (s *ChatPresenceService) Heartbeat(ctx context.Context, userID uuid.UUID) e } if err := s.redis.Set(ctx, s.presenceKey(userID), data, presenceTTL).Err(); err != nil { + metrics.RecordCacheMiss("presence") s.logger.Warn("Failed to heartbeat presence", zap.Error(err), zap.String("user_id", userID.String())) return fmt.Errorf("heartbeat presence: %w", err) } - + metrics.RecordCacheHit("presence") return nil } @@ -109,11 +114,17 @@ func (s *ChatPresenceService) GetPresence(ctx context.Context, userID uuid.UUID) data, err := s.redis.Get(ctx, s.presenceKey(userID)).Bytes() if err == redis.Nil { + // "redis.Nil" = key doesn't exist = user is offline. That's a + // legitimate read result, not an error — count as a hit so the + // hit-rate metric reflects "Redis answered correctly". + metrics.RecordCacheHit("presence") return &PresenceInfo{UserID: userID, Online: false}, nil } if err != nil { + metrics.RecordCacheMiss("presence") return nil, fmt.Errorf("get presence: %w", err) } + metrics.RecordCacheHit("presence") var info PresenceInfo if err := json.Unmarshal(data, &info); err != nil {
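One further note on `parseRedisSentinelAddrs`: it is the only pure helper added by this change, and its trimming and empty-entry handling is what keeps a typo like `"a, ,b"` from dialing a phantom sentinel, so it is cheap to pin down with a test. A minimal table-driven sketch, assuming the helper stays unexported in the same `config` package; the test file itself is not part of this diff:

```go
// redis_init_test.go — hypothetical companion test, not included in this PR.
package config

import (
	"reflect"
	"testing"
)

func TestParseRedisSentinelAddrs(t *testing.T) {
	cases := []struct {
		name string
		raw  string
		want []string
	}{
		{"empty input falls back to single-instance", "", nil},
		{"plain CSV", "redis-1.lxd:26379,redis-2.lxd:26379", []string{"redis-1.lxd:26379", "redis-2.lxd:26379"}},
		{"whitespace and stray commas are dropped", " redis-1.lxd:26379 , ,redis-2.lxd:26379,", []string{"redis-1.lxd:26379", "redis-2.lxd:26379"}},
		{"only separators collapses to nil", " , , ", nil},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			// nil (not an empty slice) is the sentinel for "no Sentinel
			// addresses configured", so compare with DeepEqual.
			if got := parseRedisSentinelAddrs(tc.raw); !reflect.DeepEqual(got, tc.want) {
				t.Fatalf("parseRedisSentinelAddrs(%q) = %#v, want %#v", tc.raw, got, tc.want)
			}
		})
	}
}
```

Run from the backend module root with something like `go test ./internal/config/...`.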