Some checks failed
Veza CI / Backend (Go) (push) Failing after 8m56s
Veza CI / Frontend (Web) (push) Has been cancelled
E2E Playwright / e2e (full) (push) Has been cancelled
Veza CI / Notify on failure (push) Blocked by required conditions
Veza CI / Rust (Stream Server) (push) Successful in 5m3s
Security Scan / Secret Scanning (gitleaks) (push) Failing after 53s
Three Incus containers, each running redis-server + redis-sentinel (co-located). redis-1 = master at first boot, redis-2/3 = replicas. Sentinel quorum=2 of 3 ; failover-timeout=30s satisfies the W3 acceptance criterion. - internal/config/redis_init.go : initRedis branches on REDIS_SENTINEL_ADDRS ; non-empty -> redis.NewFailoverClient with MasterName + SentinelAddrs + SentinelPassword. Empty -> existing single-instance NewClient (dev/local stays parametric). - internal/config/config.go : 3 new fields (RedisSentinelAddrs, RedisSentinelMasterName, RedisSentinelPassword) read from env. parseRedisSentinelAddrs trims+filters CSV. - internal/metrics/cache_hit_rate.go : new RecordCacheHit / Miss counters, labelled by subsystem. Cardinality bounded. - internal/middleware/rate_limiter.go : instrument 3 Eval call sites (DDoS, frontend log throttle, upload throttle). Hit = Redis answered, Miss = error -> in-memory fallback. - internal/services/chat_pubsub.go : instrument Publish + PublishPresence. - internal/websocket/chat/presence_service.go : instrument SetOnline / SetOffline / Heartbeat / GetPresence. redis.Nil counts as a hit (legitimate empty result). - infra/ansible/roles/redis_sentinel/ : install Redis 7 + Sentinel, render redis.conf + sentinel.conf, systemd units. Vault assertion prevents shipping placeholder passwords to staging/prod. - infra/ansible/playbooks/redis_sentinel.yml : provisions the 3 containers + applies common baseline + role. - infra/ansible/inventory/lab.yml : new groups redis_ha + redis_ha_master. - infra/ansible/tests/test_redis_failover.sh : kills the master container, polls Sentinel for the new master, asserts elapsed < 30s. - config/grafana/dashboards/redis-cache-overview.json : 3 hit-rate stats (rate_limiter / chat_pubsub / presence) + ops/s breakdown. - docs/ENV_VARIABLES.md §3 : 3 new REDIS_SENTINEL_* env vars. - veza-backend-api/.env.template : 3 placeholders (empty default). 
Acceptance (Day 11) : Sentinel failover < 30s ; cache hit-rate dashboard populated. Lab test pending Sentinel deployment. W3 verification gate progress : Redis Sentinel ✓ (this commit), MinIO EC4+2 ⏳ Day 12, CDN ⏳ Day 13, DMCA ⏳ Day 14, embed ⏳ Day 15. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
129 lines
5 KiB
Bash
Executable file
129 lines
5 KiB
Bash
Executable file
#!/usr/bin/env bash
# test_redis_failover.sh — validate that Sentinel promotes a replica to
# master in < 30s when the current master dies.
#
# Run on the Incus host that owns the redis-1/2/3 containers (typically
# the lab R720). Assumes the redis_sentinel playbook has been applied
# so the formation is healthy at script start — bails early otherwise.
#
# v1.0.9 W3 Day 11 — acceptance for the verification gate :
#   "kill Redis master, verify automatic promotion of a replica in < 30s".
#
# Usage:
#   REDIS_PASS=... SENTINEL_PASS=... bash infra/ansible/tests/test_redis_failover.sh
#
# Optional environment overrides (default in parentheses):
#   MASTER_NAME (veza-master), RTO_TARGET_SECONDS (30),
#   SENTINEL_PORT (26379), REDIS_PORT (6379)
#
# Exit codes:
#   0 — promotion completed in < 30s (acceptance met)
#   1 — formation not healthy at start
#   2 — promotion did not complete within 30s
#   3 — required tool missing on the host, or REDIS_PASS / SENTINEL_PASS not set
# Strict mode: stop on unhandled errors, on unset variables, and when any
# stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Containers that make up the Redis + Sentinel formation.
REDIS_CONTAINERS=(redis-1 redis-2 redis-3)

# Tunables — a value already present (and non-empty) in the environment wins.
: "${MASTER_NAME:=veza-master}"
: "${RTO_TARGET_SECONDS:=30}"
: "${SENTINEL_PORT:=26379}"
: "${REDIS_PORT:=6379}"

# Credentials — "?" is the placeholder meaning "not provided by the caller".
: "${REDIS_PASS:=?}"
: "${SENTINEL_PASS:=?}"

# log MESSAGE...
# Write "[HH:MM:SS] MESSAGE" to stderr; all arguments are joined with spaces.
log() {
  {
    printf '[%s] ' "$(date +%H:%M:%S)"
    printf '%s\n' "$*"
  } >&2
}
# fail MESSAGE [EXIT_CODE]
# Log "FAIL: MESSAGE" through log() and terminate the script.
#   $1 - human-readable reason
#   $2 - exit status to use (optional, defaults to 2)
# Only $1 is logged: the exit status is an argument of this helper, not part
# of the message, so it must not leak into the log line.
fail() {
  local msg=${1:-}
  local code=${2:-2}
  log "FAIL: $msg"
  exit "$code"
}

# require TOOL
# Ensure TOOL can be found by the shell; otherwise end the script through
# fail() with exit status 3.
require() {
  local tool=$1
  if ! command -v "$tool" >/dev/null 2>&1; then
    fail "required tool missing on host: $tool" 3
  fi
}

# Host tools used below: incus drives the containers, date provides timing.
for tool in incus date; do
  require "$tool"
done

# "?" is the placeholder left in place when the caller did not export the
# credentials; refuse to continue without both of them.
if [[ "$REDIS_PASS" == "?" || "$SENTINEL_PASS" == "?" ]]; then
  fail "REDIS_PASS and SENTINEL_PASS env vars are required (read them from the vault before invoking)" 3
fi

# Helper : ask any sentinel which host:port is currently master.
#
# Arguments: $1 - container whose local Sentinel is queried
# Globals:   SENTINEL_PORT, SENTINEL_PASS, MASTER_NAME (read)
# Outputs:   "<host> <port>" on stdout when the reply has that shape,
#            nothing otherwise (probe failed, empty reply, error text)
# Returns:   always 0, so `x=$(get_master_addr ct)` cannot trip errexit;
#            callers distinguish success from failure by testing for an
#            empty result.
get_master_addr() {
  local ct=$1
  local reply host port extra

  # A failing probe (container down, Sentinel unreachable, ...) is an
  # expected situation for this test, not a reason to abort the script.
  reply=$(incus exec "$ct" -- redis-cli -p "$SENTINEL_PORT" -a "$SENTINEL_PASS" --no-auth-warning \
    SENTINEL get-master-addr-by-name "$MASTER_NAME" 2>/dev/null) || return 0

  # The reply arrives as one value per line; fold it onto a single line and
  # split it into fields.
  IFS=$' \t\n' read -r host port extra <<<"${reply//$'\n'/ }" || true

  # Only a two-field answer with a numeric port is an address. Anything else
  # (blank line, error message) is reported as "no answer".
  if [[ -n "$host" && "$port" =~ ^[0-9]+$ && -z "$extra" ]]; then
    printf '%s %s' "$host" "$port"
  fi
  return 0
}

# -----------------------------------------------------------------------------
# 0. Sanity — formation must be healthy at start.
# -----------------------------------------------------------------------------
log "step 0: pre-flight — Sentinel reports current master"

# `|| master_addr_before=""` keeps errexit from killing the script silently
# when the probe itself fails; the check below then reports it with the
# documented exit code 1.
master_addr_before=$(get_master_addr "${REDIS_CONTAINERS[0]}") || master_addr_before=""

# A reply made only of whitespace is not a master address either, so strip
# whitespace before testing for emptiness.
if [ -z "${master_addr_before//[[:space:]]/}" ]; then
  fail "no master visible in Sentinel — refusing to test from a degraded baseline" 1
fi
log "current master (host port) : $master_addr_before"

# Resolve which container hosts the current master so we know whom to kill.
# The first whitespace-separated field of the Sentinel answer is the host.
read -r master_host _ <<<"$master_addr_before" || true
master_container=""

# An empty host must never be compared: it would equal the (empty) address
# of a container that reports no IPv4 and select the wrong node.
if [ -n "$master_host" ]; then
  for ct in "${REDIS_CONTAINERS[@]}"; do
    # First token of the first line of the IPv4 column. Commas and double
    # quotes are stripped — NOTE(review): the csv output appears to quote the
    # field when it holds several addresses; confirm on the lab host.
    # `|| ip=""` keeps a failing `incus list` from aborting under
    # errexit/pipefail; the container is then only matched by hostname.
    ip=$(incus list "$ct" -c 4 -f csv 2>/dev/null | head -1 | awk '{print $1}' | tr -d ',"') || ip=""
    # accept either the .lxd hostname or the IP. The .lxd suffix is what
    # sentinel.conf hands out ; the IP is what `incus list` shows.
    if [ "$ct.lxd" = "$master_host" ] || [ "$ip" = "$master_host" ]; then
      master_container=$ct
      break
    fi
  done
fi

if [ -z "$master_container" ]; then
  fail "could not map master host '$master_host' to a known container" 1
fi
log "master container resolved to: $master_container"

# -----------------------------------------------------------------------------
# 1. Kill master container — simulates hardware/process death.
# -----------------------------------------------------------------------------
log "step 1: stopping $master_container — start timer"

# Safety net: should the script exit early between the stop below and the
# restart in step 3 (for instance an errexit abort), still bring the
# container back so the formation is not left degraded.
trap 'incus start "$master_container" >/dev/null 2>&1 || true' EXIT

t0=$(date +%s)
incus stop --force "$master_container"

# -----------------------------------------------------------------------------
# 2. Poll surviving sentinels until they announce a new master.
# -----------------------------------------------------------------------------
log "step 2: polling sentinels for new master (target RTO ${RTO_TARGET_SECONDS}s)"
deadline=$((t0 + RTO_TARGET_SECONDS))
promoted=0
new_master=""

# Shape of a usable answer: "<host> <port>", trailing whitespace tolerated.
# Anything else (empty reply, error text) must not count as a promotion.
addr_re='^[^[:space:]]+[[:space:]]+[0-9]+[[:space:]]*$'

while [ "$(date +%s)" -lt "$deadline" ]; do
  for ct in "${REDIS_CONTAINERS[@]}"; do
    # The killed container cannot answer; only ask the survivors.
    if [ "$ct" = "$master_container" ]; then continue; fi
    # A probe that fails is simply "no answer yet", not a reason to abort.
    addr=$(get_master_addr "$ct") || addr=""
    if [[ "$addr" =~ $addr_re ]] && [ "$addr" != "$master_addr_before" ]; then
      new_master=$addr
      promoted=1
      break 2
    fi
  done
  sleep 1
done

t1=$(date +%s)
elapsed=$((t1 - t0))

# -----------------------------------------------------------------------------
# 3. Restart the killed container so it can rejoin as replica for the
#    next run.
# -----------------------------------------------------------------------------
log "step 3: restarting $master_container (will rejoin as replica once it catches up)"
incus start "$master_container" || true
# The explicit restart has been attempted; the safety net is no longer needed.
trap - EXIT

# -----------------------------------------------------------------------------
# 4. Verdict.
# -----------------------------------------------------------------------------
# Failure path first: dump what each surviving sentinel currently reports,
# then exit through fail() (default exit status 2).
if ! { [ "$promoted" -eq 1 ] && [ "$elapsed" -le "$RTO_TARGET_SECONDS" ]; }; then
  log "final Sentinel view:"
  for ct in "${REDIS_CONTAINERS[@]}"; do
    [ "$ct" != "$master_container" ] || continue
    printf ' %s: %s\n' "$ct" "$(get_master_addr "$ct")" >&2
  done
  fail "no replica promoted within ${RTO_TARGET_SECONDS}s (elapsed ${elapsed}s, promoted=${promoted})"
fi

# Reaching this point means a new master was announced within the target.
log "PASS: master flipped from '$master_addr_before' to '$new_master' in ${elapsed}s (target ${RTO_TARGET_SECONDS}s)"
exit 0