From c78bf1b765470cda643f1c2fa3ec726ea8cd4cdd Mon Sep 17 00:00:00 2001
From: senke
Date: Tue, 28 Apr 2026 01:30:34 +0200
Subject: [PATCH] feat(observability): SLO burn-rate alerts + 7 runbook stubs (W2 Day 10)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three SLOs with multi-window burn-rate alerts (Google SRE workbook
methodology) :

* SLO_API_AVAILABILITY : 99.5% of read (GET) requests return non-5xx
* SLO_API_LATENCY : 99% of write requests complete in < 500ms
* SLO_PAYMENT_SUCCESS : 99.5% of POST /api/v1/orders return 2xx

Each SLO has two alerts :

* SLOFastBurn — page-grade, 2% budget burned in 1h (1h+5m windows)
* SLOSlowBurn — ticket-grade, 5% budget burned in 6h (6h+30m windows)

- config/prometheus/slo.yml : 12 recording rules + 6 alerts ;
  promtool check rules => SUCCESS: 18 rules found.
- config/alertmanager/routes.yml : routing tree splits page-oncall
  (slack + PagerDuty) from ticket-oncall (slack only).
- docs/runbooks/{api-availability,api-latency,payment-success}-slo-burn.md
  + db-failover, redis-down, disk-full, cert-expiring-soon : one stub per
  likely page. Each lists first moves under 5 min + common causes.

Acceptance (Day 10) : promtool check rules green.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 config/alertmanager/routes.yml             |  96 +++++++++
 config/prometheus/slo.yml                  | 224 +++++++++++++++++++++
 docs/runbooks/api-availability-slo-burn.md |  62 ++++++
 docs/runbooks/api-latency-slo-burn.md      |  47 +++++
 docs/runbooks/cert-expiring-soon.md        |  80 ++++++++
 docs/runbooks/db-failover.md               | 108 ++++++++++
 docs/runbooks/disk-full.md                 | 123 +++++++++++
 docs/runbooks/payment-success-slo-burn.md  |  86 ++++++++
 docs/runbooks/redis-down.md                |  83 ++++++++
 9 files changed, 909 insertions(+)
 create mode 100644 config/alertmanager/routes.yml
 create mode 100644 config/prometheus/slo.yml
 create mode 100644 docs/runbooks/api-availability-slo-burn.md
 create mode 100644 docs/runbooks/api-latency-slo-burn.md
 create mode 100644 docs/runbooks/cert-expiring-soon.md
 create mode 100644 docs/runbooks/db-failover.md
 create mode 100644 docs/runbooks/disk-full.md
 create mode 100644 docs/runbooks/payment-success-slo-burn.md
 create mode 100644 docs/runbooks/redis-down.md

diff --git a/config/alertmanager/routes.yml b/config/alertmanager/routes.yml
new file mode 100644
index 000000000..dff225c2c
--- /dev/null
+++ b/config/alertmanager/routes.yml
@@ -0,0 +1,96 @@
+# Alertmanager routing tree (v1.0.9 W2 Day 10).
+#
+# Two channels :
+#   * page-oncall   — slack #alerts-page + PagerDuty bridge. Wakes
+#                     someone up. Reserved for severity=critical AND
+#                     page=true.
+#   * ticket-oncall — slack #alerts-ticket. Files a ticket; investigate
+#                     during business hours.
+#
+# Routing key = labels on the alert. The SLO rules in slo.yml set
+# `page: "true"` on fast-burn alerts and `page: "false"` on slow-burn,
+# so the burn-rate methodology and the routing tree stay coupled.
+#
+# This file is meant to be merged into the main alertmanager.yml
+# (or stitched in via -config-file overrides). Keeping it separate
+# makes it easy to diff and review the routing logic without
+# touching receiver credentials.
+route:
+  receiver: 'slack-default'
+  group_by: ['alertname', 'job', 'slo']
+  group_wait: 30s
+  group_interval: 5m
+  repeat_interval: 4h
+
+  routes:
+    # Page-grade : critical + explicitly tagged page=true.
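+    # To verify a given label set really lands on this route once the file is
+    # merged into the full alertmanager.yml (sketch — assumes amtool is
+    # installed next to Alertmanager) :
+    #   amtool config routes test --config.file=alertmanager.yml \
+    #     severity=critical page=true
+    # should print page-oncall.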
+    - matchers:
+        - severity = critical
+        - page = "true"
+      receiver: page-oncall
+      group_wait: 10s        # page faster than the default 30s
+      repeat_interval: 30m   # keep paging until ack'd
+      continue: false
+
+    # Ticket-grade : warning OR critical-without-page.
+    - matchers:
+        - page = "false"
+      receiver: ticket-oncall
+      group_wait: 1m
+      repeat_interval: 12h
+      continue: false
+
+    # Fallback : critical alerts without a page=… label still go to
+    # page-oncall. Better wake someone up for an unlabelled critical
+    # than silently route it to ticket.
+    - matchers:
+        - severity = critical
+      receiver: page-oncall
+      continue: false
+
+receivers:
+  - name: page-oncall
+    slack_configs:
+      - api_url: '${SLACK_WEBHOOK_URL_PAGE}'
+        channel: '#alerts-page'
+        send_resolved: true
+        title: '🚨 PAGE: {{ .GroupLabels.alertname }}'
+        text: |
+          {{ range .Alerts }}
+          *Severity:* {{ .Labels.severity }}
+          *SLO:* {{ .Labels.slo | default "n/a" }}
+          *Description:* {{ .Annotations.description }}
+          *Runbook:* {{ .Annotations.runbook_url }}
+          {{ end }}
+    # PagerDuty integration — populate routing_key from
+    # ${PAGERDUTY_ROUTING_KEY} once an account is provisioned. Until
+    # then the slack channel is the only page surface.
+    pagerduty_configs:
+      - routing_key: '${PAGERDUTY_ROUTING_KEY}'
+        # severity is not part of group_by, so read it from CommonLabels
+        # (it is shared by every alert this route groups together).
+        severity: '{{ .CommonLabels.severity }}'
+        send_resolved: true
+
+  - name: ticket-oncall
+    slack_configs:
+      - api_url: '${SLACK_WEBHOOK_URL_TICKET}'
+        channel: '#alerts-ticket'
+        send_resolved: true
+        title: '🎫 TICKET: {{ .GroupLabels.alertname }}'
+        text: |
+          {{ range .Alerts }}
+          *Severity:* {{ .Labels.severity }}
+          *SLO:* {{ .Labels.slo | default "n/a" }}
+          *Description:* {{ .Annotations.description }}
+          *Runbook:* {{ .Annotations.runbook_url }}
+          {{ end }}
+
+  # slack-default kept as a catch-all fallback so a misconfigured alert
+  # that escapes the route tree still hits a receiver — Alertmanager
+  # logs noise instead of silently dropping the alert.
+  - name: slack-default
+    slack_configs:
+      - api_url: '${SLACK_WEBHOOK_URL}'
+        channel: '#alerts'
+        send_resolved: true
+        title: '{{ .Status | toUpper }}: {{ .GroupLabels.alertname }}'
+        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
diff --git a/config/prometheus/slo.yml b/config/prometheus/slo.yml
new file mode 100644
index 000000000..6e3445db3
--- /dev/null
+++ b/config/prometheus/slo.yml
@@ -0,0 +1,224 @@
+# SLO definitions + multi-window burn-rate alerts (v1.0.9 W2 Day 10).
+#
+# Three SLOs, each with two alerts:
+#   * SLOFastBurn — page-grade: 2% of monthly error budget burned
+#     in 1h. Wakes someone up.
+#   * SLOSlowBurn — ticket-grade: 5% burned in 6h. Files a ticket.
+#
+# Multi-window methodology from the Google SRE workbook:
+#   - "Fast burn" = burn_rate > 14.4 over 1h (and confirmation over 5m)
+#   - "Slow burn" = burn_rate > 6 over 6h (and confirmation over 30m)
+#
+# burn_rate = error_rate / (1 - SLO_target).
+# For SLO=99.5%: budget=0.5%; burn_rate=14.4 ⇒ error_rate=7.2%.
+# For SLO=99.0% (latency): budget=1.0%; burn_rate=14.4 ⇒ error_rate=14.4%.
+#
+# All alerts carry a `runbook_url` annotation pointing into
+# docs/runbooks/<runbook>.md. Add the alert to that file's "What
+# tripped me" section if you wake up to one.
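+#
+# Where 14.4 and 6 come from (30-day window = 720h) :
+#   fast burn : 2% of budget in 1h ⇒ burn_rate = 0.02 / (1/720) = 14.4
+#   slow burn : 5% of budget in 6h ⇒ burn_rate = 0.05 / (6/720) = 6
+# At a sustained 14.4× burn the whole budget is gone in 720/14.4 ≈ 50h (~2 days) ;
+# at 6× it lasts 720/6 = 120h (5 days).
+#
+# Validate after every edit (Day 10 acceptance check) :
+#   promtool check rules config/prometheus/slo.yml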
+
+groups:
+  # ----------------------------------------------------------------------
+  # SLO 1 — API availability (read endpoints)
+  #   target: 99.5% of read requests return non-5xx
+  #   scope:  /api/v1/health + GET endpoints (path label-matched)
+  # ----------------------------------------------------------------------
+  - name: veza_slo_api_availability
+    interval: 30s
+    rules:
+      # Recording rules — express the SLO as ratios so alerts stay
+      # readable. Numerator = requests that violated the SLO (5xx).
+      # Denominator = total in-scope requests. The window is baked into
+      # the rule name so the alert expressions stay simple.
+      - record: veza:slo_api_availability:burnrate_5m
+        expr: |
+          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="GET",status=~"5.."}[5m]))
+          /
+          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="GET"}[5m]))
+      - record: veza:slo_api_availability:burnrate_30m
+        expr: |
+          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="GET",status=~"5.."}[30m]))
+          /
+          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="GET"}[30m]))
+      - record: veza:slo_api_availability:burnrate_1h
+        expr: |
+          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="GET",status=~"5.."}[1h]))
+          /
+          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="GET"}[1h]))
+      - record: veza:slo_api_availability:burnrate_6h
+        expr: |
+          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="GET",status=~"5.."}[6h]))
+          /
+          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="GET"}[6h]))
+
+      # Page — 2% of the monthly budget burned in 1h.
+      - alert: APIAvailabilitySLOFastBurn
+        expr: |
+          (veza:slo_api_availability:burnrate_1h > (14.4 * 0.005))
+          and
+          (veza:slo_api_availability:burnrate_5m > (14.4 * 0.005))
+        for: 2m
+        labels:
+          severity: critical
+          slo: api_availability
+          page: "true"
+        annotations:
+          summary: "API availability SLO: fast burn (2% budget in 1h)"
+          description: |
+            5xx rate on read endpoints is burning the monthly error budget
+            at 14.4× the steady-state rate. At this pace the entire monthly
+            budget is consumed in ~2 days. Investigate now.
+          runbook_url: "https://docs.veza.fr/runbooks/api-availability-slo-burn"
+
+      # Ticket — 5% of monthly budget burned in 6h. Slower, but signals
+      # a degradation that won't fix itself.
+      - alert: APIAvailabilitySLOSlowBurn
+        expr: |
+          (veza:slo_api_availability:burnrate_6h > (6 * 0.005))
+          and
+          (veza:slo_api_availability:burnrate_30m > (6 * 0.005))
+        for: 15m
+        labels:
+          severity: warning
+          slo: api_availability
+          page: "false"
+        annotations:
+          summary: "API availability SLO: slow burn (5% budget in 6h)"
+          description: |
+            Read endpoints are degrading slowly — 6h burn rate is 6× the
+            steady-state pace. File a ticket to investigate during business hours.
+          runbook_url: "https://docs.veza.fr/runbooks/api-availability-slo-burn"
+
+  # ----------------------------------------------------------------------
+  # SLO 2 — API latency (write endpoints)
+  #   target: 99% of write requests complete in < 500ms
+  #   scope:  POST/PUT/PATCH/DELETE on /api/v1/*
+  # ----------------------------------------------------------------------
+  - name: veza_slo_api_latency
+    interval: 30s
+    rules:
+      # "Bad" = requests slower than 500ms. We compute the ratio of slow
+      # requests to total directly from the histogram bucket boundaries.
+      # 0.5s bucket is le="0.5" — total minus that bucket = slow.
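+      # NB : the 500ms cut-off works only because the backend histogram
+      # declares an explicit le="0.5" bucket. If those bucket boundaries ever
+      # change, the numerator match comes back empty and these rules silently
+      # stop producing data — keep the instrumentation and this file in sync.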
+ - record: veza:slo_api_latency:slow_ratio_5m + expr: | + 1 - ( + sum(rate(veza_gin_http_request_duration_seconds_bucket{job="veza-backend",method=~"POST|PUT|PATCH|DELETE",le="0.5"}[5m])) + / + sum(rate(veza_gin_http_request_duration_seconds_count{job="veza-backend",method=~"POST|PUT|PATCH|DELETE"}[5m])) + ) + - record: veza:slo_api_latency:slow_ratio_30m + expr: | + 1 - ( + sum(rate(veza_gin_http_request_duration_seconds_bucket{job="veza-backend",method=~"POST|PUT|PATCH|DELETE",le="0.5"}[30m])) + / + sum(rate(veza_gin_http_request_duration_seconds_count{job="veza-backend",method=~"POST|PUT|PATCH|DELETE"}[30m])) + ) + - record: veza:slo_api_latency:slow_ratio_1h + expr: | + 1 - ( + sum(rate(veza_gin_http_request_duration_seconds_bucket{job="veza-backend",method=~"POST|PUT|PATCH|DELETE",le="0.5"}[1h])) + / + sum(rate(veza_gin_http_request_duration_seconds_count{job="veza-backend",method=~"POST|PUT|PATCH|DELETE"}[1h])) + ) + - record: veza:slo_api_latency:slow_ratio_6h + expr: | + 1 - ( + sum(rate(veza_gin_http_request_duration_seconds_bucket{job="veza-backend",method=~"POST|PUT|PATCH|DELETE",le="0.5"}[6h])) + / + sum(rate(veza_gin_http_request_duration_seconds_count{job="veza-backend",method=~"POST|PUT|PATCH|DELETE"}[6h])) + ) + + - alert: APILatencySLOFastBurn + expr: | + (veza:slo_api_latency:slow_ratio_1h > (14.4 * 0.01)) + and + (veza:slo_api_latency:slow_ratio_5m > (14.4 * 0.01)) + for: 2m + labels: + severity: critical + slo: api_latency + page: "true" + annotations: + summary: "API latency SLO: fast burn (writes slow)" + description: | + More than 14.4% of write requests are taking > 500ms over the + last hour. Likely DB contention, lock storm, or a slow query. + runbook_url: "https://docs.veza.fr/runbooks/api-latency-slo-burn" + + - alert: APILatencySLOSlowBurn + expr: | + (veza:slo_api_latency:slow_ratio_6h > (6 * 0.01)) + and + (veza:slo_api_latency:slow_ratio_30m > (6 * 0.01)) + for: 15m + labels: + severity: warning + slo: api_latency + page: "false" + annotations: + summary: "API latency SLO: slow burn (writes slow)" + description: | + 6h slow ratio above 6%. Investigate during hours. 
+          runbook_url: "https://docs.veza.fr/runbooks/api-latency-slo-burn"
+
+  # ----------------------------------------------------------------------
+  # SLO 3 — Payment success (POST /api/v1/orders → 2xx)
+  #   target: 99.5% of order POSTs return 2xx
+  # ----------------------------------------------------------------------
+  - name: veza_slo_payment_success
+    interval: 30s
+    rules:
+      - record: veza:slo_payment_success:burnrate_5m
+        expr: |
+          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="POST",path=~"/api/v1/orders.*",status!~"2.."}[5m]))
+          /
+          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="POST",path=~"/api/v1/orders.*"}[5m]))
+      - record: veza:slo_payment_success:burnrate_30m
+        expr: |
+          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="POST",path=~"/api/v1/orders.*",status!~"2.."}[30m]))
+          /
+          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="POST",path=~"/api/v1/orders.*"}[30m]))
+      - record: veza:slo_payment_success:burnrate_1h
+        expr: |
+          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="POST",path=~"/api/v1/orders.*",status!~"2.."}[1h]))
+          /
+          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="POST",path=~"/api/v1/orders.*"}[1h]))
+      - record: veza:slo_payment_success:burnrate_6h
+        expr: |
+          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="POST",path=~"/api/v1/orders.*",status!~"2.."}[6h]))
+          /
+          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="POST",path=~"/api/v1/orders.*"}[6h]))
+
+      - alert: PaymentSuccessSLOFastBurn
+        expr: |
+          (veza:slo_payment_success:burnrate_1h > (14.4 * 0.005))
+          and
+          (veza:slo_payment_success:burnrate_5m > (14.4 * 0.005))
+        for: 2m
+        labels:
+          severity: critical
+          slo: payment_success
+          page: "true"
+        annotations:
+          summary: "Payment success SLO: fast burn (orders failing)"
+          description: |
+            More than 7.2% of POST /api/v1/orders are non-2xx in the last hour.
+            Hyperswitch or marketplace pipeline failure — every minute of
+            failure means customers charged with no license issued (or the reverse).
+          runbook_url: "https://docs.veza.fr/runbooks/payment-success-slo-burn"
+
+      - alert: PaymentSuccessSLOSlowBurn
+        expr: |
+          (veza:slo_payment_success:burnrate_6h > (6 * 0.005))
+          and
+          (veza:slo_payment_success:burnrate_30m > (6 * 0.005))
+        for: 15m
+        labels:
+          severity: warning
+          slo: payment_success
+          page: "false"
+        annotations:
+          summary: "Payment success SLO: slow burn (orders failing)"
+          description: |
+            6h burn rate > 6× — file a ticket and investigate during business hours.
+          runbook_url: "https://docs.veza.fr/runbooks/payment-success-slo-burn"
diff --git a/docs/runbooks/api-availability-slo-burn.md b/docs/runbooks/api-availability-slo-burn.md
new file mode 100644
index 000000000..82dab819d
--- /dev/null
+++ b/docs/runbooks/api-availability-slo-burn.md
@@ -0,0 +1,62 @@
+# Runbook — API availability SLO burn
+
+> **SLO** : 99.5% of GET requests on `/api/v1/*` return non-5xx (monthly window).
+> **Alerts** : `APIAvailabilitySLOFastBurn` (page) · `APIAvailabilitySLOSlowBurn` (ticket)
+> **Owner** : backend on-call.
+
+## What tripped me
+
+The 5xx ratio on read endpoints is consuming the monthly error budget faster than the steady-state rate allows :
+
+- **Fast burn** (`page=true`) : 14.4× over 1h ⇒ entire monthly budget gone in ~2 days.
+- **Slow burn** (`page=false`) : 6× over 6h ⇒ entire budget gone in ~5 days.
+
+## First moves (under 5 minutes)
+
+1. **Confirm the alert is real**, not a metric-pipeline glitch :
+   ```promql
+   # Live error rate on the GETs we measure :
+   sum(rate(veza_gin_http_requests_total{job="veza-backend",method="GET",status=~"5.."}[5m]))
+   /
+   sum(rate(veza_gin_http_requests_total{job="veza-backend",method="GET"}[5m]))
+   ```
+   Open Grafana → "Veza API Overview" dashboard, panel "Request rate by path".
+
+2. **Identify the affected endpoint**. The fastest pivot :
+   ```promql
+   topk(5, sum by (path, status) (
+     rate(veza_gin_http_requests_total{job="veza-backend",method="GET",status=~"5.."}[5m])
+   ))
+   ```
+
+3. **Drop into traces**. Open the "Veza Service Map (Tempo)" dashboard and filter the slowest-spans table for the offending path. If the failures correlate with one downstream (Redis, Postgres, Hyperswitch), the trace will show it.
+
+## Common causes
+
+| Symptom | Likely cause | Fix |
+| -------------------------------------------- | ----------------------------------------------------------- | ---------------------------------------------------- |
+| 5xx concentrated on `/feed`, `/library` | Postgres slow / connection pool exhausted | See `db-failover.md` — check `pg_auto_failover` state |
+| 5xx concentrated on `/search`, `/tracks` | Postgres FTS index churn or autovacuum holding row locks | `SELECT pid, query FROM pg_stat_activity WHERE state='active' ORDER BY xact_start LIMIT 5;` |
+| 5xx across all paths, sudden | Pod just restarted / migration broken / DB unreachable | `kubectl get pods -n veza` or `systemctl status veza-backend-api` |
+| 5xx slowly climbing | Memory leak; container approaching OOMKill | `kubectl top pod -n veza` and bounce the leaking pod |
+| 5xx confined to one instance | Single bad replica (config, certs, networking) | Drain that instance from the load balancer |
+
+## If the page is real
+
+1. **Page the secondary on-call** if the primary fix is going to take > 15 min.
+2. **Update the status page** (`status.veza.fr`) with "Investigating elevated error rates."
+3. **Post in #incident-response** with the alert link + first hypothesis.
+
+## When to silence
+
+- The degradation is a known, already-announced maintenance window : silence for the duration of the window.
+- Single-instance issue, that instance has been drained : silence for 1h.
+- Otherwise, **do not silence** — let the alert keep firing until the burn rate drops below threshold naturally.
+
+## Recovery verification
+
+After mitigation, both burn-rate windows must drop below threshold for the alert to clear (1h and 5m for fast burn, 6h and 30m for slow burn). The 6h window means the slow-burn alert can keep firing for hours after the issue is fixed — don't be surprised.
+
+## Postmortem trigger
+
+A page-grade alert that fires for > 15 minutes triggers a postmortem doc (`docs/postmortems/YYYY-MM-DD-<slug>.md`). Include the timeline, the trace IDs, and the metric query screenshots.
diff --git a/docs/runbooks/api-latency-slo-burn.md b/docs/runbooks/api-latency-slo-burn.md
new file mode 100644
index 000000000..f9da5f3ca
--- /dev/null
+++ b/docs/runbooks/api-latency-slo-burn.md
@@ -0,0 +1,47 @@
+# Runbook — API latency SLO burn
+
+> **SLO** : 99% of write requests (POST/PUT/PATCH/DELETE) complete in < 500ms (monthly window).
+> **Alerts** : `APILatencySLOFastBurn` (page) · `APILatencySLOSlowBurn` (ticket)
+> **Owner** : backend on-call.
+
+## What tripped me
+
+Too many writes are exceeding the 500ms threshold. The fast burn fires when > 14.4% of writes are slow over 1h.
+
+## First moves (under 5 minutes)
+
+1. 
**Identify the slow endpoints** : + ```promql + topk(5, histogram_quantile(0.95, + sum by (path, le) (rate(veza_gin_http_request_duration_seconds_bucket{job="veza-backend",method=~"POST|PUT|PATCH|DELETE"}[5m])) + )) + ``` +2. **Open Tempo service-map dashboard** ("Veza Service Map (Tempo)") and check the slow-spans table for the same paths. + +## Common causes + +| Symptom | Likely cause | Pointer | +| ------------------------------------------------ | -------------------------------------------------- | ----------------------------- | +| Slow on `/api/v1/orders` (POST) | Hyperswitch upstream latency | `payment-success-slo-burn.md` | +| Slow on `/api/v1/tracks` (POST) | S3 multipart pre-sign / commit latency | Check MinIO health | +| Slow across all writes | Postgres lock contention / autovacuum | `db-failover.md` §autovacuum | +| Slow only on one host | One bad replica (CPU starvation, disk) | Drain & investigate | +| Slow + DB pool exhausted in logs | A slow query holding the pool | `db-failover.md` §pool | + +## Mitigation + +- If Hyperswitch : nothing to do but wait + status-page banner. +- If DB lock contention : `pg_blocking_pids()` + cancel the offender : + ```sql + SELECT pg_cancel_backend(pid) FROM pg_stat_activity + WHERE state = 'active' AND xact_start < now() - INTERVAL '30 seconds'; + ``` +- If a single bad replica : drain it from the LB and investigate offline. + +## Recovery + +The slow-burn alert can take 6h to clear after a fix. Don't silence — let it ride down. + +## Postmortem trigger + +Same threshold as the availability runbook — fast burn > 15 min triggers a postmortem. diff --git a/docs/runbooks/cert-expiring-soon.md b/docs/runbooks/cert-expiring-soon.md new file mode 100644 index 000000000..e47a80e6a --- /dev/null +++ b/docs/runbooks/cert-expiring-soon.md @@ -0,0 +1,80 @@ +# Runbook — TLS certificate expiring soon + +> **Alert** : `CertExpiringSoon` (warning at 30d, critical at 7d). +> **Owner** : infra on-call. + +## Cert inventory + +| Domain | Issuer | Auto-renew ? | Where it lives | +| -------------------------- | ----------------- | ---------------------- | ---------------------------------------- | +| `api.veza.fr` | Let's Encrypt | Yes (Caddy) | Caddy data dir on the prod LB | +| `app.veza.fr` | Let's Encrypt | Yes (Caddy) | Caddy data dir on the prod LB | +| `staging.veza.fr` + sub | Let's Encrypt | Yes (Caddy) | Staging Caddy | +| `*.lxd` (internal) | self-signed | No — manually rotated | Each container's `/etc/ssl/...` | +| `jwt-private.pem` / public | self-generated | No — rotated yearly | Backend host (mounted via volume) | +| `pgaf-*.veza.lxd` | self-signed | No — rotated yearly | pg_auto_failover pki dir | + +The alert fires for the **public-facing** certs above. Internal `.lxd` certs are tracked separately by a yearly calendar reminder. + +## Auto-renewing certs (Let's Encrypt via Caddy) + +Caddy renews 30 days before expiry. If the alert fires at 30d, that's the renewal window starting — confirm the renewal is happening : + +```bash +# On the LB host : +sudo journalctl -u caddy --since "1 day ago" | grep -i "obtain\|renew\|cert" + +# Caddy's internal state : +sudo curl -fsS http://localhost:2019/config/ | jq '.apps.tls.automation' +``` + +If renewal is failing : + +1. **Rate-limit** : Let's Encrypt has a 5-attempt-per-hour limit per cert. Check Caddy log for `429 Too Many Requests`. +2. **DNS not pointing here** : `dig +short api.veza.fr` must point at this LB. +3. **Port 80 blocked** : ACME HTTP-01 challenge needs port 80. 
`sudo ss -lntp | grep ':80'` should show Caddy. +4. **Disk full** : Caddy writes the new cert to disk before swapping. See `disk-full.md`. + +## Self-signed `.lxd` certs + +These rotate on a yearly cadence (calendar reminder, not automated). When the alert fires : + +```bash +# Inspect a cert : +echo | openssl s_client -connect minio.lxd:9000 -servername minio.lxd 2>/dev/null | openssl x509 -noout -dates -subject + +# Regenerate (one-shot for self-signed CA + leaf) : +cd infra/pki/lab +./regenerate-cert.sh minio.lxd +# Then push to the container : +incus file push minio.crt minio:/etc/ssl/certs/minio.crt +incus file push minio.key minio:/etc/ssl/private/minio.key +incus exec minio -- systemctl reload minio +``` + +(Script TODO — currently the rotation is manual openssl. W4 backlog.) + +## JWT keys + +`jwt-private.pem` / `jwt-public.pem` are RSA keys, not X.509. They don't "expire" but are rotated yearly. Procedure : + +1. Generate a new pair : + ```bash + ./scripts/generate-jwt-keys.sh + ``` +2. Roll the public key first (backend trusts new + old) — current code only loads one ; needs a small extension. **Tracked as v1.1 work.** +3. Until that's wired, rotation = downtime window where every existing access token becomes invalid (5 min lifetime mitigates this). + +## After rotation + +1. Hit a public endpoint and confirm the new cert is served : + ```bash + echo | openssl s_client -connect api.veza.fr:443 2>/dev/null | openssl x509 -noout -dates + ``` +2. The `CertExpiringSoon` alert clears within one Prometheus scrape interval (~30s) once `probe_ssl_earliest_cert_expiry` is updated by blackbox-exporter. +3. If the cert was rotated under fire (renewal hit a wall, manual replacement), file a postmortem with the timeline. + +## What CAN break + +- Pinned certs in the mobile app (none today, but keep this in mind for v2+). +- Customer integrations that fetched our public key once and cached it — JWT public key rotation will reject their cached signatures. Until v1.1 we don't promise stable JWT keys to third parties. diff --git a/docs/runbooks/db-failover.md b/docs/runbooks/db-failover.md new file mode 100644 index 000000000..e91251a39 --- /dev/null +++ b/docs/runbooks/db-failover.md @@ -0,0 +1,108 @@ +# Runbook — Postgres failover (`pg_auto_failover`) + +> **Alerts** : `PostgresPrimaryUnreachable`, `PostgresReplicationLagHigh` · also reached from `api-availability-slo-burn.md` and `api-latency-slo-burn.md`. +> **Owner** : infra on-call. + +## Topology recap + +``` +┌─────────────────┐ +│ pgaf-monitor │ ← state machine; assigns primary/standby roles +└────────┬────────┘ + │ pg_auto_failover protocol + │ + ┌─────┴─────┐ + │ │ +┌──▼───┐ ┌───▼────┐ +│ pgaf-│ │ pgaf- │ +│primary│ │replica │ +└───────┘ └────────┘ +``` + +PgBouncer (`pgaf-pgbouncer`, port 6432) sits in front of whoever is currently primary. Backend reads `DATABASE_URL` from env that already points at the bouncer. + +## What "failover" looks like + +- Primary disappears (crash, host reboot, manual `incus stop`). +- Monitor notices within `pgaf_health_check_interval` (~10s). +- After `pgaf_failover_timeout` (60s), monitor promotes the replica to primary. +- PgBouncer is reconfigured by the monitor's notify hook ; new connections go to the new primary. + +**Expected RTO is ~60 seconds.** RPO ≈ 0 if synchronous replication was caught up; up to one tx if async. + +## Diagnose state + +```bash +# From any node : +sudo -u postgres pg_autoctl show state + +# Look for one node with state="primary" and one with state="secondary". 
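+
+# Side check (plain Postgres, not pg_auto_failover) : replication lag in
+# bytes as seen from the current primary.
+sudo -u postgres psql -c "SELECT application_name, state, pg_wal_lsn_diff(sent_lsn, replay_lsn) AS lag_bytes FROM pg_stat_replication;"
+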
+# If both are "wait_for_primary" the formation is wedged. + +# Connection-level test (does the bouncer route to a live primary?) : +psql "$DATABASE_URL" -c "SELECT now(), pg_is_in_recovery();" +# pg_is_in_recovery = false ⇒ you're hitting the primary +``` + +## Common failure modes + +### A. Monitor is up, primary is down, replica didn't get promoted + +Either `pgaf_failover_timeout` hasn't elapsed yet (wait 60s) **or** the replica is too far behind to be safe. + +```bash +# On the replica : +sudo -u postgres pg_autoctl show state +# Check the LSN distance — if it's > 1MB the monitor will refuse promote. +``` + +If monitor refused, manual promotion (only if you accept potential data loss) : + +```bash +sudo -u postgres pg_autoctl perform failover --formation default --group 0 +``` + +### B. Monitor itself is down + +The data nodes keep serving their last-known role until the monitor returns. Reads keep working from the standby. **No automatic failover happens** without the monitor — start it before doing anything else. + +```bash +sudo systemctl start pg_autoctl@monitor +sudo journalctl -u pg_autoctl@monitor -n 200 --no-pager +``` + +### C. Both data nodes are down (catastrophe) + +Restore from pgBackRest. See the dr-drill runbook in `docs/archive/` (or the `pgbackrest` role README) for the manual procedure. **Estimated RTO ~30 min** with a full+diff already on MinIO. + +## Connection routing + +PgBouncer holds the routing decision, so during a failover : + +```bash +# Confirm which Postgres backend is currently behind the bouncer : +psql -h pgaf-pgbouncer.lxd -p 6432 -U pgbouncer pgbouncer -c "SHOW SERVERS;" +``` + +If the bouncer is still pointing at the dead primary : + +```bash +# Reload the bouncer config (the pg_auto_failover monitor's +# `host_change_hook.sh` should have done this automatically — if not, +# something is broken) : +sudo systemctl reload pgbouncer +``` + +## Backend behavior during failover + +The backend's GORM connection pool drops dead connections lazily. Expect a few hundred 5xx during the 30-60s window — this trips `APIAvailabilitySLOFastBurn`. The alert clears once the pool refills. + +## After recovery + +1. Re-add the failed node as standby : + ```bash + sudo -u postgres pg_autoctl create postgres ... + ``` +2. Wait for `pg_autoctl show state` to show two healthy nodes. +3. Run the next dr-drill cycle to validate backups against the new primary. +4. Postmortem if downtime > 5 min. diff --git a/docs/runbooks/disk-full.md b/docs/runbooks/disk-full.md new file mode 100644 index 000000000..9acadd2c9 --- /dev/null +++ b/docs/runbooks/disk-full.md @@ -0,0 +1,123 @@ +# Runbook — Disk full / `/` filesystem at threshold + +> **Alerts** : `DiskSpaceLow` (warning at 85%) · `DiskSpaceCritical` (page at 95%). +> **Owner** : infra on-call. + +## Hosts to watch + +| Host | What fills the disk | +| --------------------- | -------------------------------------------------------------------- | +| `pgaf-primary` | WAL + autovacuum bloat. WAL fills if pgBackRest archive falls behind. | +| `pgaf-replica` | Replication lag → WAL not replayed; same WAL accumulation. | +| `pgaf-pgbouncer` | Logs in `/var/log/postgresql/pgbouncer.log` if log_disconnections=on. | +| `tempo` | Trace blocks under `/var/lib/tempo`. Default retention 14d. | +| `otel-collector` | Almost never — no on-disk state by default. | +| API/web hosts (k8s) | Container images, log rotation, build caches. | +| `minio-*` | Object data — lifecycle policy supposed to manage this. 
| + +## First moves (under 2 minutes) + +```bash +df -h +# Identify the mount that's tight, then : +sudo du -h --max-depth=2 -x /var/lib | sort -hr | head -20 +sudo du -h --max-depth=2 -x /var/log | sort -hr | head -20 +``` + +## Postgres data nodes (`pgaf-*`) + +### A. WAL piling up + +If `/var/lib/postgresql/16/main/pg_wal` is the offender : + +```bash +# Is pgBackRest shipping ? +sudo -u postgres pgbackrest --stanza=veza info | tail -20 + +# Last WAL push time should be < 1 minute ago. +``` + +If pgBackRest is stuck (S3 unreachable, credentials rotated) : + +1. **Don't** force `pg_resetwal` — that's data loss. +2. Fix the upstream (network, credentials), then push pending WAL : + ```bash + sudo -u postgres pgbackrest --stanza=veza archive-push + ``` + +### B. Autovacuum bloat + +```bash +sudo -u postgres psql -c " + SELECT relname, n_live_tup, n_dead_tup, + pg_size_pretty(pg_total_relation_size(relid)) AS size + FROM pg_stat_user_tables + ORDER BY n_dead_tup DESC LIMIT 10; +" +``` + +Manual vacuum on the worst offender : + +```bash +sudo -u postgres psql -c "VACUUM (VERBOSE, ANALYZE) ;" +# Or VACUUM FULL if you have the downtime — it rewrites the table. +``` + +## Tempo host + +Trace blocks default to 14d retention. If the host is full anyway, the lifecycle compactor isn't keeping up : + +```bash +sudo systemctl status tempo +sudo journalctl -u tempo -n 200 --no-pager | grep -i compact +``` + +Emergency recovery — drop oldest blocks manually : + +```bash +sudo -u tempo find /var/lib/tempo/blocks -mindepth 1 -maxdepth 1 -type d -mtime +14 -exec rm -rf {} + +``` + +(This is safe because the blocks are write-once, append-only ; the index in `wal/` is rebuilt at restart.) + +## API/web hosts (Kubernetes) + +```bash +# Images : +kubectl describe node | grep -A 5 "Allocated resources" + +# Container logs (rotation should be handling this — check): +sudo du -h --max-depth=1 /var/log/pods | sort -hr | head -10 + +# If a single pod is logging GB/min, that's a regression. Restart it +# and grep its previous logs for the loop signature. +``` + +## MinIO + +If the storage bucket is full : + +```bash +mc admin info veza-minio +mc du veza-minio/ --depth=2 +``` + +Check the lifecycle policy is applied : + +```bash +mc ilm rule list veza-minio/veza-tracks +``` + +## Recovery verification + +Once free space is back : + +- Postgres : confirm `pg_wal` size is bounded (should be < `wal_keep_size` + ~ 64MB). +- Tempo : `df -h /var/lib/tempo` is below 70%. +- The disk-space alert clears within one Prometheus scrape interval (~ 30s). + +## Long-term prevention + +- pgBackRest archive lag → fix the alert (currently only `BackupRestoreDrillStale`, doesn't catch this) ; W3 backlog. +- Tempo retention spilling → migrate Tempo to S3-backed (`tempo_storage_backend: s3`). W3 day 12 covers this. +- API log volume → tighten log levels in prod (`LOG_LEVEL=INFO`). diff --git a/docs/runbooks/payment-success-slo-burn.md b/docs/runbooks/payment-success-slo-burn.md new file mode 100644 index 000000000..e2f5a9b4d --- /dev/null +++ b/docs/runbooks/payment-success-slo-burn.md @@ -0,0 +1,86 @@ +# Runbook — Payment success SLO burn + +> **SLO** : 99.5% of `POST /api/v1/orders` return 2xx (monthly window). +> **Alerts** : `PaymentSuccessSLOFastBurn` (page) · `PaymentSuccessSLOSlowBurn` (ticket) +> **Owner** : payments on-call (rotates with backend on-call until v2.0). + +## Why this is critical + +A failing checkout means **money lost** (charged customer, no license issued) or **money taken twice** (double-submitted on retry). 
Worst-case fraud window is the time it takes to roll the upstream change. Treat fast-burn here like a Sev-1 incident. + +## First moves (under 5 minutes) + +1. **Confirm the alert** : + ```promql + sum(rate(veza_gin_http_requests_total{job="veza-backend",method="POST",path=~"/api/v1/orders.*",status!~"2.."}[5m])) + / + sum(rate(veza_gin_http_requests_total{job="veza-backend",method="POST",path=~"/api/v1/orders.*"}[5m])) + ``` + +2. **Pivot on status code** : + ```promql + sum by (status) (rate(veza_gin_http_requests_total{job="veza-backend",method="POST",path=~"/api/v1/orders.*"}[5m])) + ``` + - Spike in **502/503** → Hyperswitch unreachable. See "Hyperswitch outage" below. + - Spike in **400** → marketplace validation failing. New deploy regressed something — check recent commits to `internal/core/marketplace/`. + - Spike in **500** → DB / connection / panic. Check logs for stack traces. + +3. **Trace pivot**. "Veza Service Map (Tempo)" → filter `payment.webhook` for `status=error` recent spans. + +## Hyperswitch outage + +If Hyperswitch is the upstream culprit : + +```bash +# Check Hyperswitch's own status : +curl -fsS https://api.hyperswitch.io/health + +# Check the last successful webhook landing : +psql "$DATABASE_URL" -c " + SELECT id, hyperswitch_payment_id, status, payment_status, updated_at + FROM orders + WHERE updated_at > NOW() - INTERVAL '15 minutes' + ORDER BY updated_at DESC LIMIT 10; +" +``` + +If they're all stuck in `payment_status=pending`, Hyperswitch is silently dropping our webhooks. Engage their support **and** queue a manual reconciliation pass once they're back : + +```bash +# Manual reconciliation script (still TODO — tracked in W4 day 17) : +go run ./cmd/tools/reconcile_orders --since=15m +``` + +## DB / pool exhaustion + +If the failures are 500s and the API logs show `pq: too many connections` or `context deadline exceeded` : + +1. Check pgbouncer queue length : + ```bash + psql -h pgaf-pgbouncer.lxd -p 6432 -U pgbouncer pgbouncer -c "SHOW POOLS;" + ``` +2. If `cl_waiting > 0` consistently, a slow query is holding pool slots — see `db-failover.md` for finding it. +3. Last resort : restart the backend pod to drop in-flight requests (loses idempotency on retried requests; only do this if Hyperswitch is *not* in flight on those orders). + +## Recovery verification + +After fix : + +```bash +# Most recent 10 orders should be `completed` or `pending` (not `failed`) : +psql "$DATABASE_URL" -c " + SELECT status, COUNT(*) FROM orders + WHERE created_at > NOW() - INTERVAL '5 minutes' + GROUP BY status; +" +``` + +The slow-burn window (6h) takes hours to clear after recovery. Don't silence — wait for the metric. + +## Reconciliation post-incident + +Every fast-burn incident requires a reconciliation pass within 24h : + +1. Pull the list of `orders` with `payment_status='pending'` older than 30 minutes. +2. For each, query Hyperswitch directly via `GET /payments/{payment_id}` and update. +3. File a postmortem with the count of mismatches resolved. diff --git a/docs/runbooks/redis-down.md b/docs/runbooks/redis-down.md new file mode 100644 index 000000000..212984928 --- /dev/null +++ b/docs/runbooks/redis-down.md @@ -0,0 +1,83 @@ +# Runbook — Redis unavailable + +> **Alert** : `RedisUnreachable` (existing, in `alert_rules.yml`). +> **Owner** : infra on-call. + +## What breaks when Redis is down + +Veza uses Redis for several distinct concerns ; the impact differs by callsite. 
+ +| Subsystem | Effect when Redis is gone | Severity | +| --------------------------------- | --------------------------------------------------- | -------- | +| Session storage / refresh tokens | Login/refresh fail — users log out on next request | **HIGH** — most users notice within minutes | +| Rate limiter (`UserRateLimiter`) | Fails-open — requests stop being rate-limited | MEDIUM — capacity risk if Redis stays down for hours | +| JWT revocation | Revoked tokens accepted again | **SECURITY** — silent failure, no user-visible signal | +| Cache (track lookups, feed pages) | Slow but works — falls back to Postgres | LOW — surfaces as elevated p95 | +| Queue (RabbitMQ-fronted jobs) | Independent — RabbitMQ is the queue, Redis is just metrics | NONE | + +## First moves + +1. **Confirm Redis is actually down**, not "just unreachable from one host" : + ```bash + redis-cli -h redis.lxd ping + ``` +2. If it's a single-host issue, skip ahead to "Backend can't reach Redis" below. + +## Redis instance is down + +```bash +# Check the systemd state on whichever host owns Redis : +sudo systemctl status redis + +# If "failed", inspect logs : +sudo journalctl -u redis -n 200 --no-pager + +# Disk full ? Dump dir is /var/lib/redis : +df -h /var/lib/redis +``` + +Common causes : + +- **OOM-killed by RDB snapshot.** `maxmemory` reached, no eviction policy, snapshot fork doubled the RSS. Set `maxmemory-policy allkeys-lru` and bump `maxmemory`. +- **Disk full.** AOF or RDB filling `/var/lib/redis`. Truncate AOF (`BGREWRITEAOF`) or move the dir. +- **Process crashed.** Bring it back up : `sudo systemctl restart redis`. + +## Backend can't reach Redis + +Network/DNS issue, not a Redis fault. Check : + +```bash +# From the API container : +nc -zv redis.lxd 6379 + +# DNS resolution : +getent hosts redis.lxd +``` + +Likely culprits : Incus bridge restart, security group change on the API host, stale DNS cache. + +## Mitigation while Redis is down + +The backend's `internal/cache/redis_cache.go` already has fallback logic for the cache path. The session and rate-limiter paths fail loud. If recovery is going to take > 5 min : + +1. **Drain new logins** by surfacing a maintenance banner on the frontend : flip `MAINTENANCE_MODE=true` in the API env and restart. (existing — set in `internal/middleware/maintenance.go`). +2. **Do NOT drop the rate limiter to "always allow"** — temporarily switch it to "always deny" via env (`RATELIMIT_FAIL_CLOSED=true`) so abuse can't ride the outage. + +## Recovery + +Once Redis is back : + +1. Verify connectivity from each backend instance : + ```bash + docker exec veza-backend-api redis-cli -u "$REDIS_URL" ping + ``` +2. Existing sessions stay valid — refresh tokens were lost, but access tokens (5 min lifetime) keep working until expiry. Users will be prompted to log in again as their access tokens roll over. +3. Cache is cold — the next 5-15 min of traffic hits Postgres harder. Monitor "Veza API Overview" → "p95 latency" panel. + +## Postmortem trigger + +Any Redis outage > 10 min triggers a postmortem. The session loss UX is bad; we want to know the time-to-detect and time-to-recover. + +## Future-proofing + +Redis Sentinel HA is **W3 day 11** on the launch roadmap. Once that's in, this runbook's "instance is down" section reduces to "the failover happened, verify the new master."
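+
+A sketch of what that verification could look like — the Sentinel host
+(`sentinel.lxd`), the conventional port 26379 and the master-set name `veza`
+are placeholders until the W3 deployment fixes them :
+
+```bash
+# Ask Sentinel which node it currently considers the master :
+redis-cli -h sentinel.lxd -p 26379 sentinel get-master-addr-by-name veza
+
+# Then confirm that node actually accepts writes :
+redis-cli -h <ip-returned-above> -p 6379 set probe:sentinel-check ok EX 60
+```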