#!/usr/bin/env bash
# monitor-checks.sh — poll the soft-launch acceptance gate live during
# the beta window so the operator gets a heads-up before the decision
# call instead of discovering at 18:00 UTC that one threshold is red.
#
# Acceptance gate (per docs/SOFT_LAUNCH_BETA_2026.md §"Acceptance gate"):
#   - ≥ 50 testers signed up (used_at IS NOT NULL on beta_invites)
#   - 0 P1 events in Sentry today
#   - Status page green for the last 4 h
#   - Synthetic parcours all green for 6 h
#   - Nightly k6 load test green
#   - < 3 HIGH-severity issues reported
#
# v1.0.10 Cluster 3.4.
#
# Usage:
#   DATABASE_URL=postgres://... \
#   SENTRY_AUTH_TOKEN=... \
#   STATUSPAGE_URL=https://status.veza.fr \
#   PROM_URL=https://prom.veza.fr \
#     bash scripts/soft-launch/monitor-checks.sh
#
# By default the script runs once and exits with the gate's verdict.
# Run it from cron (e.g. every 30 min) or pass LOOP=1 to keep checking
# in-place every CHECK_INTERVAL seconds (default 600 = 10 min).
#
# Optional env:
#   LOOP=1              continuous mode
#   CHECK_INTERVAL      seconds between checks in LOOP mode (default 600)
#   QUIET=1             only emit the verdict line (for cron piping)
#   THRESHOLD_TESTERS   override 50 (default), e.g. set to 100 for
#                       a stricter sub-window
#   CURL_MAX_TIME       per-request HTTP timeout in seconds (default 20)
#
# Exit codes:
#   0 — every gate green
#   1 — at least one gate red
#   2 — at least one gate could not be checked (collector down,
#       token wrong, etc.) — operator must verify manually
#   3 — required env / tool missing or invalid

set -euo pipefail

DATABASE_URL=${DATABASE_URL:-}
SENTRY_AUTH_TOKEN=${SENTRY_AUTH_TOKEN:-}
STATUSPAGE_URL=${STATUSPAGE_URL:-https://status.veza.fr}
PROM_URL=${PROM_URL:-}
LOOP=${LOOP:-0}
CHECK_INTERVAL=${CHECK_INTERVAL:-600}
QUIET=${QUIET:-0}
THRESHOLD_TESTERS=${THRESHOLD_TESTERS:-50}
CURL_MAX_TIME=${CURL_MAX_TIME:-20}

# Print a configuration error on stderr and exit with the "env / tool
# missing" status (3).
die_config() {
  printf '%s\n' "$*" >&2
  exit 3
}

[ -n "$DATABASE_URL" ] || die_config "DATABASE_URL required"
[ -n "$SENTRY_AUTH_TOKEN" ] \
  || die_config "SENTRY_AUTH_TOKEN required (read scope sufficient)"
[ -n "$PROM_URL" ] || die_config "PROM_URL required"
# A non-numeric threshold would make the integer comparison in
# check_testers error out and silently report the gate as red.
[[ "$THRESHOLD_TESTERS" =~ ^[0-9]+$ ]] \
  || die_config "THRESHOLD_TESTERS must be a non-negative integer (got '$THRESHOLD_TESTERS')"

for tool in psql curl jq; do
  command -v "$tool" >/dev/null 2>&1 || die_config "$tool required"
done
unset tool

# ----------------------------------------------------------------------
# Individual gate checks. Each prints one line prefixed with "✅ " /
# "🔴 " / "⚪ " (last for "could not check"), and sets its GATE_*_OK
# variable to 0 (green) / 1 (red) / 2 (could not check).
#
# Every check must report failure through its GATE_*_OK variable and
# never abort the script: a failing command substitution is therefore
# always followed by an explicit `|| var=""` fallback so `set -e`
# cannot turn "collector down" into a crash.
# ----------------------------------------------------------------------

GATE_TESTERS_OK=2
GATE_SENTRY_OK=2
GATE_STATUSPAGE_OK=2
GATE_SYNTHETIC_OK=2
GATE_K6_OK=2
GATE_ISSUES_OK=2

# http_get CURL_ARGS... — silent GET that fails on HTTP >= 400 (-f) and
# gives up after CURL_MAX_TIME seconds, so an error page is never parsed
# as data and a hung endpoint cannot stall the whole gate run.
http_get() {
  curl -sf --max-time "$CURL_MAX_TIME" "$@"
}

# Gate: number of redeemed beta invites >= THRESHOLD_TESTERS.
# Sets GATE_TESTERS_OK.
check_testers() {
  local count
  count=$(psql "$DATABASE_URL" -A -t -c "
    SELECT count(*) FROM beta_invites WHERE used_at IS NOT NULL;
  " 2>/dev/null | tr -d ' ') || count=""

  if ! [[ "$count" =~ ^[0-9]+$ ]]; then
    echo "⚪ testers signed-up : check failed (psql)"
    GATE_TESTERS_OK=2
    return 0
  fi

  if [ "$count" -ge "$THRESHOLD_TESTERS" ]; then
    echo "✅ testers signed-up : $count / $THRESHOLD_TESTERS"
    GATE_TESTERS_OK=0
  else
    echo "🔴 testers signed-up : $count / $THRESHOLD_TESTERS"
    GATE_TESTERS_OK=1
  fi
}

# Gate: no unresolved fatal-level Sentry issue in the last 24 h.
# Sets GATE_SENTRY_OK.
check_sentry_p1() {
  local count
  # Only a JSON array is a valid issue list. An error payload is a JSON
  # object, and a bare `length` on it would return its key count and be
  # misreported as that many P1 events.
  # NOTE(review): the list endpoint is presumably paginated, so a
  # non-zero count is a lower bound; only "== 0" is decisive here.
  count=$(
    http_get -H "Authorization: Bearer $SENTRY_AUTH_TOKEN" \
      "https://sentry.io/api/0/projects/veza/veza-backend/issues/?statsPeriod=24h&query=is:unresolved%20level:fatal" \
      2>/dev/null \
      | jq 'if type == "array" then length else error("unexpected payload") end' \
        2>/dev/null
  ) || count=""

  if ! [[ "$count" =~ ^[0-9]+$ ]]; then
    echo "⚪ Sentry P1 events 24h : check failed (auth or network)"
    GATE_SENTRY_OK=2
    return 0
  fi

  if [ "$count" -eq 0 ]; then
    echo "✅ Sentry P1 events 24h : 0"
    GATE_SENTRY_OK=0
  else
    echo "🔴 Sentry P1 events 24h : $count (must be 0)"
    GATE_SENTRY_OK=1
  fi
}

# Gate: status page indicator is green. Sets GATE_STATUSPAGE_OK.
# This reads the *current* indicator only; the "for the last 4 h" part
# of the acceptance gate is not verified by this check.
check_statuspage() {
  local status
  status=$(
    http_get "$STATUSPAGE_URL/api/v1/status" 2>/dev/null \
      | jq -r '.indicator // .status.indicator // ""' 2>/dev/null
  ) || status=""

  case "$status" in
    none | operational)
      echo "✅ status page : $status (green)"
      GATE_STATUSPAGE_OK=0
      ;;
    minor | major | critical)
      echo "🔴 status page : $status"
      GATE_STATUSPAGE_OK=1
      ;;
    *)
      echo "⚪ status page : check failed (got '$status')"
      GATE_STATUSPAGE_OK=2
      ;;
  esac
}

# Gate: every synthetic parcours probe succeeded for the whole last 6 h.
# Sets GATE_SYNTHETIC_OK.
check_synthetic() {
  # min_over_time over 6h is 0 for any parcours that failed at least
  # once in the window, which is what the gate asks for (an instant
  # `probe_success == 0` would only see the current state).
  local query='min_over_time(probe_success{probe_kind="synthetic"}[6h])'
  local resp summary total="" failing_count="" failing=""

  resp=$(http_get --get "$PROM_URL/api/v1/query" \
    --data-urlencode "query=$query" 2>/dev/null) || resp=""

  # One jq pass yields "<series> <failing> <comma-separated names>".
  # A non-"success" status is an error: on such payloads `.data` is
  # null and a bare `length` would be 0, i.e. a false green.
  summary=$(jq -r '
    if .status == "success" then
      (.data.result | length) as $total
      | [ .data.result[]
          | select((.value[1] | tonumber) == 0)
          | (.metric.parcours // "unknown") ] as $bad
      | "\($total) \($bad | length) \($bad | join(","))"
    else
      error("query failed")
    end' <<<"$resp" 2>/dev/null) || summary=""
  read -r total failing_count failing <<<"$summary" || true

  if ! [[ "$total" =~ ^[0-9]+$ && "$failing_count" =~ ^[0-9]+$ ]]; then
    echo "⚪ synthetic parcours : check failed (Prometheus)"
    GATE_SYNTHETIC_OK=2
    return 0
  fi

  # No series at all means the probes are not reporting; that is not
  # evidence of health, so refuse to call it green.
  if [ "$total" -eq 0 ]; then
    echo "⚪ synthetic parcours : check failed (no probe_success series — probes offline?)"
    GATE_SYNTHETIC_OK=2
    return 0
  fi

  if [ "$failing_count" -eq 0 ]; then
    echo "✅ synthetic parcours : all green"
    GATE_SYNTHETIC_OK=0
  else
    echo "🔴 synthetic parcours : $failing_count failing ($failing)"
    GATE_SYNTHETIC_OK=1
  fi
}

# Gate: the nightly k6 load test last succeeded less than 30 h ago.
# Sets GATE_K6_OK.
check_k6_nightly() {
  # k6 nightly is exposed as veza_k6_nightly_last_success_timestamp_seconds
  # by the Forgejo runner workflow's textfile-collector. Reading via Prom
  # gives "is the last success < 30h old?".
  local query='time() - veza_k6_nightly_last_success_timestamp_seconds'
  local -r max_age=108000 # 30h
  local resp age age_int

  resp=$(http_get --get "$PROM_URL/api/v1/query" \
    --data-urlencode "query=$query" 2>/dev/null) || resp=""
  age=$(jq -r '.data.result[0].value[1] // ""' <<<"$resp" 2>/dev/null) || age=""

  # Accept plain decimal numbers only; empty, "null", "NaN", "+Inf" and
  # anything unparsable all mean the metric cannot be trusted.
  if ! [[ "$age" =~ ^-?[0-9]+(\.[0-9]+)?$ ]]; then
    echo "⚪ k6 nightly : check failed (metric absent — runner offline?)"
    GATE_K6_OK=2
    return 0
  fi

  # Drop the fractional part with parameter expansion: printf '%.0f'
  # depends on the locale's decimal separator.
  age_int=${age%%.*}

  if [ "$age_int" -lt "$max_age" ]; then
    echo "✅ k6 nightly : last success $(( age_int / 3600 ))h ago"
    GATE_K6_OK=0
  else
    echo "🔴 k6 nightly : last success $(( age_int / 3600 ))h ago (> 30h)"
    GATE_K6_OK=1
  fi
}

# Gate: fewer than 3 HIGH-severity issues in the beta report.
# Sets GATE_ISSUES_OK.
check_high_issues() {
  # The operator-reported issues count lives in the SOFT_LAUNCH_BETA_2026.md
  # report under "Issues reported". Without an external tracker we read it
  # from a known location in the report file. Skip if file absent.
  local repo_root report count
  repo_root=$(cd "$(dirname "$0")/../.." 2>/dev/null && pwd) || repo_root=""
  report="$repo_root/docs/SOFT_LAUNCH_BETA_2026.md"

  if [ -z "$repo_root" ] || [ ! -f "$report" ]; then
    echo "⚪ HIGH issues count : report file not found"
    GATE_ISSUES_OK=2
    return 0
  fi

  # `grep -c` prints the count itself — including "0" — and exits 1 when
  # nothing matched. Appending a fallback `echo 0` would produce "0\n0"
  # and break the integer test below, so only the exit status is ignored.
  count=$(grep -cE '^\| HIGH ' -- "$report" 2>/dev/null) || true

  if ! [[ "$count" =~ ^[0-9]+$ ]]; then
    echo "⚪ HIGH issues count : report file not readable"
    GATE_ISSUES_OK=2
    return 0
  fi

  if [ "$count" -lt 3 ]; then
    echo "✅ HIGH-severity issues reported : $count / < 3"
    GATE_ISSUES_OK=0
  else
    echo "🔴 HIGH-severity issues reported : $count / < 3"
    GATE_ISSUES_OK=1
  fi
}

# ----------------------------------------------------------------------
# Main loop
# ----------------------------------------------------------------------

# Run every gate check in order. Each gate is reset to "could not
# check" first so a LOOP iteration can never reuse a stale verdict.
run_all_checks() {
  GATE_TESTERS_OK=2
  GATE_SENTRY_OK=2
  GATE_STATUSPAGE_OK=2
  GATE_SYNTHETIC_OK=2
  GATE_K6_OK=2
  GATE_ISSUES_OK=2

  check_testers
  check_sentry_p1
  check_statuspage
  check_synthetic
  check_k6_nightly
  check_high_issues
}

# Run one full gate evaluation and print the verdict line.
# Returns: 0 all green, 1 at least one gate red, 2 no red gate but at
#          least one gate could not be checked.
run_once() {
  local v red=0 unknown=0

  if [ "$QUIET" = "1" ]; then
    # QUIET promises "only the verdict line": drop the per-gate lines.
    # A redirected function call still runs in this shell, so the
    # GATE_*_OK assignments are kept.
    run_all_checks >/dev/null
  else
    echo "================================================================"
    echo "Acceptance gate check — $(date -u +'%Y-%m-%d %H:%M:%S UTC')"
    echo "----------------------------------------------------------------"
    run_all_checks
    echo "----------------------------------------------------------------"
  fi

  for v in "$GATE_TESTERS_OK" "$GATE_SENTRY_OK" "$GATE_STATUSPAGE_OK" \
    "$GATE_SYNTHETIC_OK" "$GATE_K6_OK" "$GATE_ISSUES_OK"; do
    case "$v" in
      1) red=$(( red + 1 )) ;;
      2) unknown=$(( unknown + 1 )) ;;
    esac
  done

  if [ "$red" -eq 0 ] && [ "$unknown" -eq 0 ]; then
    echo "VERDICT : ALL GATES GREEN — soft-launch is GO"
    return 0
  elif [ "$red" -gt 0 ]; then
    echo "VERDICT : $red gate(s) RED — NO-GO until resolved"
    return 1
  else
    echo "VERDICT : $unknown gate(s) UNCHECKABLE — operator must verify manually before decision call"
    return 2
  fi
}

if [ "$LOOP" != "1" ]; then
  # Capture the verdict explicitly instead of relying on errexit firing
  # on a non-zero return before `exit` is reached.
  rc=0
  run_once || rc=$?
  exit "$rc"
fi

# Continuous mode.
while true; do
  run_once || true
  if [ "$QUIET" != "1" ]; then
    echo ""
    echo "next check in ${CHECK_INTERVAL}s — Ctrl-C to exit"
  fi
  sleep "$CHECK_INTERVAL"
done