# veza/config/prometheus/slo.yml

# SLO definitions + multi-window burn-rate alerts (v1.0.9 W2 Day 10).
#
# Three SLOs, each with two alerts:
#   * <name>SLOFastBurn  — page-grade: 2% of monthly error budget burned
#                          in 1h. Wakes someone up.
#   * <name>SLOSlowBurn  — ticket-grade: 5% burned in 6h. Files a ticket.
#
# Multi-window methodology from the Google SRE workbook:
#   - "Fast burn" = burn_rate > 14.4 over 1h (and confirmation over 5m)
#   - "Slow burn" = burn_rate > 6   over 6h (and confirmation over 30m)
#
# burn_rate = error_rate / (1 - SLO_target).
# For SLO=99.5%: budget=0.5%; burn_rate=14.4 ⇒ error_rate=7.2%.
# For SLO=99.0% (latency): budget=1.0%; burn_rate=14.4 ⇒ error_rate=14.4%.
#
# All alerts carry a `runbook_url` annotation pointing into
# docs/runbooks/<alert-name>.md. Add the alert to that file's "What
# tripped me" section if you wake up to one.
groups:
  # ----------------------------------------------------------------------
  # SLO 1 — API availability (read endpoints)
  #   target: 99.5% of read requests return non-5xx
  #   scope:  all GET requests served by job="veza-backend" (matched on
  #           the method label; the rules apply no path filter)
  # ----------------------------------------------------------------------
  - name: veza_slo_api_availability
    # How often every rule in this group is evaluated.
    interval: 30s
    rules:
      # Recording rules — one per look-back window (5m, 30m, 1h, 6h).
      # Each stores the raw error ratio for GET requests on veza-backend:
      #   numerator   = rate of requests that violated the SLO (5xx)
      #   denominator = rate of all in-scope requests
      # NOTE: despite the `burnrate_` prefix, the recorded value is the
      # error ratio itself, not error_ratio / budget. The alerts below
      # therefore compare it against burn_rate × budget (budget = 0.005
      # for the 99.5% target). The names are kept as-is because other
      # consumers may already query them.
      - record: veza:slo_api_availability:burnrate_5m
        expr: |
          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="GET",status=~"5.."}[5m]))
          /
          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="GET"}[5m]))
      - record: veza:slo_api_availability:burnrate_30m
        expr: |
          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="GET",status=~"5.."}[30m]))
          /
          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="GET"}[30m]))
      - record: veza:slo_api_availability:burnrate_1h
        expr: |
          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="GET",status=~"5.."}[1h]))
          /
          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="GET"}[1h]))
      - record: veza:slo_api_availability:burnrate_6h
        expr: |
          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="GET",status=~"5.."}[6h]))
          /
          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="GET"}[6h]))

      # Page — 2% of the monthly budget burned in 1h.
      # Fires when the 1h AND 5m error ratios both exceed
      # 14.4 × 0.005 = 0.072 (7.2%) and the condition holds for 2 minutes.
      # The 5m window is the confirmation window: it requires the burn to
      # still be happening now, not just somewhere in the past hour.
      # Time to exhaust the budget at this rate: 30d / 14.4 ≈ 2.1 days.
      - alert: APIAvailabilitySLOFastBurn
        expr: |
          (veza:slo_api_availability:burnrate_1h > (14.4 * 0.005))
          and
          (veza:slo_api_availability:burnrate_5m > (14.4 * 0.005))
        for: 2m
        labels:
          severity: critical
          slo: api_availability
          page: "true"
        annotations:
          summary: "API availability SLO: fast burn (2% budget in 1h)"
          description: |
            5xx rate on read endpoints is burning the monthly error budget
            14.4× the steady-state rate. At this pace the entire monthly
            budget is consumed in ~2 days. Investigate now.
          runbook_url: "https://docs.veza.fr/runbooks/api-availability-slo-burn"

      # Ticket — 5% of monthly budget burned in 6h. Slower, but signals
      # a degradation that won't fix itself.
      # Fires when the 6h AND 30m error ratios both exceed
      # 6 × 0.005 = 0.03 (3%) and the condition holds for 15 minutes.
      - alert: APIAvailabilitySLOSlowBurn
        expr: |
          (veza:slo_api_availability:burnrate_6h > (6 * 0.005))
          and
          (veza:slo_api_availability:burnrate_30m > (6 * 0.005))
        for: 15m
        labels:
          severity: warning
          slo: api_availability
          page: "false"
        annotations:
          summary: "API availability SLO: slow burn (5% budget in 6h)"
          description: |
            Read endpoints are degrading slowly — 6h burn rate is 6× the
            steady-state pace. File a ticket to investigate during business hours.
          runbook_url: "https://docs.veza.fr/runbooks/api-availability-slo-burn"

  # ----------------------------------------------------------------------
  # SLO 2 — API latency (write endpoints)
  #   target: 99% of write requests complete within 500ms
  #   scope:  POST/PUT/PATCH/DELETE on /api/v1/*
  # ----------------------------------------------------------------------
  - name: veza_slo_api_latency
    # How often every rule in this group is evaluated.
    interval: 30s
    rules:
      # Recording rules — one per look-back window (5m, 30m, 1h, 6h).
      # "Bad" = requests slower than 500ms. Histogram buckets are
      # cumulative, so the le="0.5" bucket counts every request that
      # finished in <= 0.5s; dividing by _count gives the fast fraction
      # and `1 - fast/total` is the slow fraction that is recorded.
      # Scope: POST/PUT/PATCH/DELETE requests on job="veza-backend".
      # NOTE(review): this relies on the histogram exposing a bucket
      # boundary at exactly le="0.5" — confirm against the backend's
      # bucket configuration; without it the numerator matches no series.
      - record: veza:slo_api_latency:slow_ratio_5m
        expr: |
          1 - (
            sum(rate(veza_gin_http_request_duration_seconds_bucket{job="veza-backend",method=~"POST|PUT|PATCH|DELETE",le="0.5"}[5m]))
            /
            sum(rate(veza_gin_http_request_duration_seconds_count{job="veza-backend",method=~"POST|PUT|PATCH|DELETE"}[5m]))
          )
      - record: veza:slo_api_latency:slow_ratio_30m
        expr: |
          1 - (
            sum(rate(veza_gin_http_request_duration_seconds_bucket{job="veza-backend",method=~"POST|PUT|PATCH|DELETE",le="0.5"}[30m]))
            /
            sum(rate(veza_gin_http_request_duration_seconds_count{job="veza-backend",method=~"POST|PUT|PATCH|DELETE"}[30m]))
          )
      - record: veza:slo_api_latency:slow_ratio_1h
        expr: |
          1 - (
            sum(rate(veza_gin_http_request_duration_seconds_bucket{job="veza-backend",method=~"POST|PUT|PATCH|DELETE",le="0.5"}[1h]))
            /
            sum(rate(veza_gin_http_request_duration_seconds_count{job="veza-backend",method=~"POST|PUT|PATCH|DELETE"}[1h]))
          )
      - record: veza:slo_api_latency:slow_ratio_6h
        expr: |
          1 - (
            sum(rate(veza_gin_http_request_duration_seconds_bucket{job="veza-backend",method=~"POST|PUT|PATCH|DELETE",le="0.5"}[6h]))
            /
            sum(rate(veza_gin_http_request_duration_seconds_count{job="veza-backend",method=~"POST|PUT|PATCH|DELETE"}[6h]))
          )

      # Page — fires when the 1h AND 5m slow ratios both exceed
      # 14.4 × 0.01 = 0.144 (14.4%) and the condition holds for 2 minutes.
      # 0.01 is the error budget of the 99% latency target; the 5m window
      # confirms the slowdown is still ongoing.
      - alert: APILatencySLOFastBurn
        expr: |
          (veza:slo_api_latency:slow_ratio_1h > (14.4 * 0.01))
          and
          (veza:slo_api_latency:slow_ratio_5m > (14.4 * 0.01))
        for: 2m
        labels:
          severity: critical
          slo: api_latency
          page: "true"
        annotations:
          summary: "API latency SLO: fast burn (writes slow)"
          description: |
            More than 14.4% of write requests are taking > 500ms over the
            last hour. Likely DB contention, lock storm, or a slow query.
          runbook_url: "https://docs.veza.fr/runbooks/api-latency-slo-burn"

      # Ticket — fires when the 6h AND 30m slow ratios both exceed
      # 6 × 0.01 = 0.06 (6%) and the condition holds for 15 minutes.
      - alert: APILatencySLOSlowBurn
        expr: |
          (veza:slo_api_latency:slow_ratio_6h > (6 * 0.01))
          and
          (veza:slo_api_latency:slow_ratio_30m > (6 * 0.01))
        for: 15m
        labels:
          severity: warning
          slo: api_latency
          page: "false"
        annotations:
          summary: "API latency SLO: slow burn (writes slow)"
          description: |
            6h slow ratio above 6%. Investigate during hours.
          runbook_url: "https://docs.veza.fr/runbooks/api-latency-slo-burn"

  # ----------------------------------------------------------------------
  # SLO 3 — Payment success (POST /api/v1/orders → 201)
  #   target: 99.5% of order POSTs return 2xx
  # ----------------------------------------------------------------------
  - name: veza_slo_payment_success
    # How often every rule in this group is evaluated.
    interval: 30s
    rules:
      # Recording rules — one per look-back window (5m, 30m, 1h, 6h).
      # Each stores the failure ratio for order POSTs on veza-backend:
      #   numerator   = rate of POSTs whose status is anything but 2xx
      #                 (status!~"2.." — so 3xx/4xx count as failures too)
      #   denominator = rate of all POSTs on the matched paths
      # Prometheus regex matchers are fully anchored, so
      # path=~"/api/v1/orders.*" matches every path that starts with
      # /api/v1/orders, including any sub-resources under it.
      # As with the other SLO groups, the `burnrate_` series hold the raw
      # ratio; the alerts multiply the budget (0.005) by the burn rate.
      - record: veza:slo_payment_success:burnrate_5m
        expr: |
          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="POST",path=~"/api/v1/orders.*",status!~"2.."}[5m]))
          /
          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="POST",path=~"/api/v1/orders.*"}[5m]))
      - record: veza:slo_payment_success:burnrate_30m
        expr: |
          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="POST",path=~"/api/v1/orders.*",status!~"2.."}[30m]))
          /
          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="POST",path=~"/api/v1/orders.*"}[30m]))
      - record: veza:slo_payment_success:burnrate_1h
        expr: |
          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="POST",path=~"/api/v1/orders.*",status!~"2.."}[1h]))
          /
          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="POST",path=~"/api/v1/orders.*"}[1h]))
      - record: veza:slo_payment_success:burnrate_6h
        expr: |
          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="POST",path=~"/api/v1/orders.*",status!~"2.."}[6h]))
          /
          sum(rate(veza_gin_http_requests_total{job="veza-backend",method="POST",path=~"/api/v1/orders.*"}[6h]))

      # Page — fires when the 1h AND 5m failure ratios both exceed
      # 14.4 × 0.005 = 0.072 (7.2%) and the condition holds for 2 minutes.
      # The 5m window confirms orders are still failing right now.
      - alert: PaymentSuccessSLOFastBurn
        expr: |
          (veza:slo_payment_success:burnrate_1h > (14.4 * 0.005))
          and
          (veza:slo_payment_success:burnrate_5m > (14.4 * 0.005))
        for: 2m
        labels:
          severity: critical
          slo: payment_success
          page: "true"
        annotations:
          summary: "Payment success SLO: fast burn (orders failing)"
          description: |
            > 7% of POST /api/v1/orders are non-2xx in the last hour.
            Hyperswitch or marketplace pipeline failure — every minute
            is a customer charged but no license issued (or vice versa).
          runbook_url: "https://docs.veza.fr/runbooks/payment-success-slo-burn"

      # Ticket — fires when the 6h AND 30m failure ratios both exceed
      # 6 × 0.005 = 0.03 (3%) and the condition holds for 15 minutes.
      - alert: PaymentSuccessSLOSlowBurn
        expr: |
          (veza:slo_payment_success:burnrate_6h > (6 * 0.005))
          and
          (veza:slo_payment_success:burnrate_30m > (6 * 0.005))
        for: 15m
        labels:
          severity: warning
          slo: payment_success
          page: "false"
        annotations:
          summary: "Payment success SLO: slow burn (orders failing)"
          description: |
            6h burn rate > 6× — file a ticket, investigate during hours.
          runbook_url: "https://docs.veza.fr/runbooks/payment-success-slo-burn"