# SLO definitions + multi-window burn-rate alerts (v1.0.9 W2 Day 10).
#
# Three SLOs, each with two alerts:
#   * <Name>SLOFastBurn — page-grade: 2% of monthly error budget burned
#     in 1h. Wakes someone up.
#   * <Name>SLOSlowBurn — ticket-grade: 5% burned in 6h. Files a ticket.
#
# Multi-window methodology from the Google SRE workbook:
#   - "Fast burn" = burn_rate > 14.4 over 1h (and confirmation over 5m)
#   - "Slow burn" = burn_rate > 6 over 6h (and confirmation over 30m)
#
# burn_rate = error_rate / (1 - SLO_target).
#   For SLO=99.5%: budget=0.5%; burn_rate=14.4 ⇒ error_rate=7.2%.
#   For SLO=99.0% (latency): budget=1.0%; burn_rate=14.4 ⇒ error_rate=14.4%.
#
# Time to exhaust a 30-day budget at a constant burn rate is
# 30d / burn_rate: 14.4 ⇒ ~2.1 days, 6 ⇒ 5 days.
#
# NOTE: the recorded series named `...:burnrate_<window>` hold the raw
# error ratio (bad / total), not the normalised burn rate. That is why
# every alert compares them against `burn_rate * budget`
# (e.g. 14.4 * 0.005) instead of against 14.4 directly. The names are
# kept as-is because anything outside this file may query them.
#
# All alerts carry a `runbook_url` annotation pointing into
# docs/runbooks/<runbook-slug>.md (presumably the slug is the last path
# segment of the alert's runbook_url — confirm against the docs tree).
# Add the alert to that file's "What tripped me" section if you wake up
# to one.

groups:
  # ----------------------------------------------------------------------
  # SLO 1 — API availability (read endpoints)
  #   target: 99.5% of read requests return non-5xx
  #   scope:  every GET request on job="veza-backend". Selection is done
  #           on the `method` label only; these rules apply no `path`
  #           matcher.
  # ----------------------------------------------------------------------
  - name: veza_slo_api_availability
    interval: 30s
    rules:
      # Recording rules — express the SLO as ratios so alerts stay
      # readable. Numerator = requests that violated the SLO (5xx).
      # Denominator = total in-scope requests. The window is baked into
      # the rule name so the alert expressions stay declarative.
      - record: veza:slo_api_availability:burnrate_5m
        expr: |
          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="GET",status=~"5.."}[5m]))
          /
          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="GET"}[5m]))

      - record: veza:slo_api_availability:burnrate_30m
        expr: |
          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="GET",status=~"5.."}[30m]))
          /
          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="GET"}[30m]))

      - record: veza:slo_api_availability:burnrate_1h
        expr: |
          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="GET",status=~"5.."}[1h]))
          /
          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="GET"}[1h]))

      - record: veza:slo_api_availability:burnrate_6h
        expr: |
          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="GET",status=~"5.."}[6h]))
          /
          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="GET"}[6h]))

      # Page — 2% of the monthly budget burned in 1h.
      # Threshold 14.4 * 0.005 = 7.2% error ratio; the 5m window confirms
      # the burn is still ongoing before paging.
      - alert: APIAvailabilitySLOFastBurn
        expr: |
          (veza:slo_api_availability:burnrate_1h > (14.4 * 0.005))
          and
          (veza:slo_api_availability:burnrate_5m > (14.4 * 0.005))
        for: 2m
        labels:
          severity: critical
          slo: api_availability
          page: "true"
        annotations:
          summary: "API availability SLO: fast burn (2% budget in 1h)"
          # 30 days / 14.4 ≈ 2.1 days until the whole budget is gone.
          description: |
            5xx rate on read endpoints is burning the monthly error budget
            14.4× the steady-state rate. At this pace the entire monthly
            budget is consumed in ~2 days. Investigate now.
          runbook_url: "https://docs.veza.fr/runbooks/api-availability-slo-burn"

      # Ticket — 5% of monthly budget burned in 6h. Slower, but signals
      # a degradation that won't fix itself.
      # Threshold 6 * 0.005 = 3% error ratio; confirmed over 30m.
      - alert: APIAvailabilitySLOSlowBurn
        expr: |
          (veza:slo_api_availability:burnrate_6h > (6 * 0.005))
          and
          (veza:slo_api_availability:burnrate_30m > (6 * 0.005))
        for: 15m
        labels:
          severity: warning
          slo: api_availability
          page: "false"
        annotations:
          summary: "API availability SLO: slow burn (5% budget in 6h)"
          description: |
            Read endpoints are degrading slowly — 6h burn rate is 6× the
            steady-state pace. File a ticket to investigate during
            business hours.
          runbook_url: "https://docs.veza.fr/runbooks/api-availability-slo-burn"

  # ----------------------------------------------------------------------
  # SLO 2 — API latency (write endpoints)
  #   target: 99% of write requests complete within 500ms
  #   scope:  POST/PUT/PATCH/DELETE requests on job="veza-backend".
  #           Selection is done on the `method` label only; these rules
  #           apply no `path` matcher.
  # ----------------------------------------------------------------------
  - name: veza_slo_api_latency
    interval: 30s
    rules:
      # "Bad" = requests slower than 500ms. We compute the ratio of slow
      # requests to total directly from the histogram bucket boundaries.
      # 0.5s bucket is le="0.5" — total minus that bucket = slow.
      # NOTE(review): assumes the histogram really exposes an le="0.5"
      # bucket; if it does not, the numerator selects nothing and these
      # rules produce no samples — confirm against the instrumentation.
      - record: veza:slo_api_latency:slow_ratio_5m
        expr: |
          1 - (
            sum(rate(veza_gin_http_request_duration_seconds_bucket{job="veza-backend",method=~"POST|PUT|PATCH|DELETE",le="0.5"}[5m]))
            /
            sum(rate(veza_gin_http_request_duration_seconds_count{job="veza-backend",method=~"POST|PUT|PATCH|DELETE"}[5m]))
          )

      - record: veza:slo_api_latency:slow_ratio_30m
        expr: |
          1 - (
            sum(rate(veza_gin_http_request_duration_seconds_bucket{job="veza-backend",method=~"POST|PUT|PATCH|DELETE",le="0.5"}[30m]))
            /
            sum(rate(veza_gin_http_request_duration_seconds_count{job="veza-backend",method=~"POST|PUT|PATCH|DELETE"}[30m]))
          )

      - record: veza:slo_api_latency:slow_ratio_1h
        expr: |
          1 - (
            sum(rate(veza_gin_http_request_duration_seconds_bucket{job="veza-backend",method=~"POST|PUT|PATCH|DELETE",le="0.5"}[1h]))
            /
            sum(rate(veza_gin_http_request_duration_seconds_count{job="veza-backend",method=~"POST|PUT|PATCH|DELETE"}[1h]))
          )

      - record: veza:slo_api_latency:slow_ratio_6h
        expr: |
          1 - (
            sum(rate(veza_gin_http_request_duration_seconds_bucket{job="veza-backend",method=~"POST|PUT|PATCH|DELETE",le="0.5"}[6h]))
            /
            sum(rate(veza_gin_http_request_duration_seconds_count{job="veza-backend",method=~"POST|PUT|PATCH|DELETE"}[6h]))
          )

      # Page — threshold 14.4 * 0.01 = 14.4% slow requests over 1h,
      # confirmed over 5m.
      - alert: APILatencySLOFastBurn
        expr: |
          (veza:slo_api_latency:slow_ratio_1h > (14.4 * 0.01))
          and
          (veza:slo_api_latency:slow_ratio_5m > (14.4 * 0.01))
        for: 2m
        labels:
          severity: critical
          slo: api_latency
          page: "true"
        annotations:
          summary: "API latency SLO: fast burn (writes slow)"
          description: |
            More than 14.4% of write requests are taking > 500ms over the
            last hour. Likely DB contention, lock storm, or a slow query.
          runbook_url: "https://docs.veza.fr/runbooks/api-latency-slo-burn"

      # Ticket — threshold 6 * 0.01 = 6% slow requests over 6h,
      # confirmed over 30m.
      - alert: APILatencySLOSlowBurn
        expr: |
          (veza:slo_api_latency:slow_ratio_6h > (6 * 0.01))
          and
          (veza:slo_api_latency:slow_ratio_30m > (6 * 0.01))
        for: 15m
        labels:
          severity: warning
          slo: api_latency
          page: "false"
        annotations:
          summary: "API latency SLO: slow burn (writes slow)"
          description: |
            6h slow ratio above 6%. Investigate during hours.
          runbook_url: "https://docs.veza.fr/runbooks/api-latency-slo-burn"

  # ----------------------------------------------------------------------
  # SLO 3 — Payment success (POST /api/v1/orders → 201)
  #   target: 99.5% of order POSTs return 2xx
  #   scope:  POST requests whose `path` label matches /api/v1/orders.*
  #           (the regex also covers any sub-path under /api/v1/orders).
  #           Every non-2xx status counts against the budget, so client
  #           errors (4xx) burn it as well as server errors (5xx).
  # ----------------------------------------------------------------------
  - name: veza_slo_payment_success
    interval: 30s
    rules:
      - record: veza:slo_payment_success:burnrate_5m
        expr: |
          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="POST",path=~"/api/v1/orders.*",status!~"2.."}[5m]))
          /
          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="POST",path=~"/api/v1/orders.*"}[5m]))

      - record: veza:slo_payment_success:burnrate_30m
        expr: |
          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="POST",path=~"/api/v1/orders.*",status!~"2.."}[30m]))
          /
          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="POST",path=~"/api/v1/orders.*"}[30m]))

      - record: veza:slo_payment_success:burnrate_1h
        expr: |
          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="POST",path=~"/api/v1/orders.*",status!~"2.."}[1h]))
          /
          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="POST",path=~"/api/v1/orders.*"}[1h]))

      - record: veza:slo_payment_success:burnrate_6h
        expr: |
          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="POST",path=~"/api/v1/orders.*",status!~"2.."}[6h]))
          /
          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="POST",path=~"/api/v1/orders.*"}[6h]))

      # Page — threshold 14.4 * 0.005 = 7.2% failed order POSTs over 1h,
      # confirmed over 5m.
      - alert: PaymentSuccessSLOFastBurn
        expr: |
          (veza:slo_payment_success:burnrate_1h > (14.4 * 0.005))
          and
          (veza:slo_payment_success:burnrate_5m > (14.4 * 0.005))
        for: 2m
        labels:
          severity: critical
          slo: payment_success
          page: "true"
        annotations:
          summary: "Payment success SLO: fast burn (orders failing)"
          description: |
            > 7% of POST /api/v1/orders are non-2xx in the last hour.
            Hyperswitch or marketplace pipeline failure — every minute is a
            customer charged but no license issued (or vice versa).
          runbook_url: "https://docs.veza.fr/runbooks/payment-success-slo-burn"

      # Ticket — threshold 6 * 0.005 = 3% failed order POSTs over 6h,
      # confirmed over 30m.
      - alert: PaymentSuccessSLOSlowBurn
        expr: |
          (veza:slo_payment_success:burnrate_6h > (6 * 0.005))
          and
          (veza:slo_payment_success:burnrate_30m > (6 * 0.005))
        for: 15m
        labels:
          severity: warning
          slo: payment_success
          page: "false"
        annotations:
          summary: "Payment success SLO: slow burn (orders failing)"
          description: |
            6h burn rate > 6× — file a ticket, investigate during hours.
          runbook_url: "https://docs.veza.fr/runbooks/payment-success-slo-burn"