veza/config/prometheus/alert_rules.yml
senke a8a8b47b00 fix(backend): print config-init error to stderr before silent exit
main.go's config-load failure path silently os.Exit(1)s, which means
lumberjack's file-rotation buffer never flushes before exit and the
journal only sees "started → exited 1" with zero diagnostics. The last
deploy run's app log had only the "Logger initialized" line; the
actual NewConfig error never made it to disk because os.Exit doesn't
run defers.

A plain fmt.Fprintf to stderr reaches the systemd journal synchronously,
so the next probe rescue dump will show what's actually failing.

The original "don't write to stderr to avoid broken pipe with
journald" comment cited a concern that doesn't apply at this point in
startup: there's no parent to break the pipe to, and journald accepts
arbitrary bytes on stderr. Keep the os.Exit but print first.
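
The change is roughly this shape (a sketch only; the real call site in
main.go and the config import path may differ):

    package main

    import (
        "fmt"
        "os"

        "veza/internal/config" // assumed import path for the NewConfig package
    )

    func main() {
        cfg, err := config.NewConfig()
        if err != nil {
            // os.Exit skips deferred log flushes, so stderr (captured
            // synchronously by journald) is the only place this can land.
            fmt.Fprintf(os.Stderr, "config init failed: %v\n", err)
            os.Exit(1)
        }
        _ = cfg // logger / server wiring continues as before
    }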

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-02 12:34:17 +02:00


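# Sanity check after editing (assuming promtool from the Prometheus
# distribution is on PATH; path relative to the repo root):
#   promtool check rules config/prometheus/alert_rules.yml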
groups:
- name: veza_critical
rules:
- alert: ServiceDown
expr: up == 0
for: 30s
labels:
severity: critical
annotations:
summary: "Service {{ $labels.job }} is down"
description: "{{ $labels.instance }} has been down for more than 30 seconds."
- alert: HighErrorRate
expr: rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m]) > 0.05
for: 5m
labels:
severity: warning
annotations:
summary: "High error rate on {{ $labels.job }}"
description: "Error rate is above 5% for the last 5 minutes."
- alert: HighLatencyP99
expr: histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) > 2
for: 5m
labels:
severity: warning
annotations:
summary: "High P99 latency on {{ $labels.job }}"
description: "P99 latency is above 2 seconds for the last 5 minutes."
- alert: RedisUnreachable
expr: redis_up == 0
for: 30s
labels:
severity: critical
annotations:
summary: "Redis is unreachable"
description: "Redis has been unreachable for more than 30 seconds."
# v1.0.9 Day 8: backup integrity. The dr-drill.sh script writes
# textfile-collector metrics on every run. Two failure modes are
# caught:
# 1. last drill reported a failure (success=0)
# 2. drill hasn't run in 8+ days (timer broke, runner offline,
# script crashed before write_metric)
# Both get alerts because a backup we haven't proved restorable is
# technical debt waiting for a disaster to bite; finding out at
# restore-time is too late.
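# Illustrative shape of the textfile output (metric names match the rules
# below; label values and the exact label set are assumptions, the script
# is the source of truth):
#   veza_backup_drill_last_success{stanza="main"} 1
#   veza_backup_drill_last_run_timestamp_seconds 1714968000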
- name: veza_backup
rules:
- alert: BackupRestoreDrillFailed
expr: veza_backup_drill_last_success == 0
for: 5m
labels:
severity: critical
annotations:
summary: "pgBackRest dr-drill last run failed (stanza={{ $labels.stanza }})"
description: |
The most recent dr-drill.sh execution reported failure
(reason={{ $labels.reason }}). Backups exist but a
restore from them did NOT round-trip the smoke query.
Investigate via: journalctl -u pgbackrest-drill.service -n 200
and consider running the drill manually with --keep to
inspect the restored container before teardown.
runbook_url: "https://veza.fr/runbooks/backup-restore-drill-failed"
- alert: BackupRestoreDrillStale
expr: time() - veza_backup_drill_last_run_timestamp_seconds > 691200 # 8 days
for: 1h
labels:
severity: warning
annotations:
summary: "pgBackRest dr-drill hasn't run in 8+ days"
description: |
The dr-drill timer fires weekly (Sun 04:00 UTC). A run
older than 8 days means the timer is broken, the runner
is offline, or the script crashed before writing its
metrics file. Verify with:
systemctl status pgbackrest-drill.timer
journalctl -u pgbackrest-drill.service -n 200
runbook_url: "https://veza.fr/runbooks/backup-restore-drill-stale"
# v1.0.9 W3 Day 12: distributed MinIO health. EC:2 tolerates 2-drive
# loss before data becomes unavailable, so the alert fires the moment
# one drive is offline — gives us margin to react before the second
# failure exhausts redundancy.
- name: veza_minio
rules:
- alert: MinIODriveOffline
# minio_node_drive_online_total counts the drives each node currently sees
# as online; comparing it to minio_node_drive_total catches any offline drive.
# The metrics are exposed by every node (MINIO_PROMETHEUS_AUTH_TYPE=public),
# so a single missed scrape doesn't trip the alert.
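# Illustrative series (one drive per node; server values are placeholders):
#   minio_node_drive_online_total{server="minio-1:9000"} 1
#   minio_node_drive_total{server="minio-1:9000"} 1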
expr: min(minio_node_drive_online_total) by (server) < min(minio_node_drive_total) by (server)
for: 2m
labels:
severity: warning
page: "false"
annotations:
summary: "MinIO drive offline on {{ $labels.server }}"
description: |
One or more drives report offline on {{ $labels.server }}. EC:2
still serves reads, but a second drive failure would cause a
data-unavailability event. Investigate within the hour.
ssh {{ $labels.server }} sudo journalctl -u minio -n 200
runbook_url: "https://veza.fr/runbooks/minio-drive-offline"
- alert: MinIONodesUnreachable
# Two or more nodes down on the 4-node EC:2 cluster means redundancy is
# exhausted. Pages the on-call: the page has to land at the tolerance
# ceiling, before one more failure tips into data unavailability.
expr: count(up{job="minio"} == 0) >= 2
for: 1m
labels:
severity: critical
page: "true"
annotations:
summary: "Two or more MinIO nodes unreachable"
description: |
EC:2 tolerates 2-drive loss. With 1 drive per node, ≥ 2 nodes
unreachable means we are at-or-past the redundancy ceiling.
Any further failure causes data unavailability. Page now.
runbook_url: "https://veza.fr/runbooks/minio-nodes-unreachable"
# W5+: Forgejo+Ansible+Incus deploy pipeline. The deploy_app.yml
# playbook writes a textfile-collector .prom file under
# /var/lib/node_exporter/textfile_collector/veza_deploy.prom on every
# deploy attempt. node_exporter scrapes it and exposes the metrics
# via the standard /metrics endpoint, no Pushgateway needed.
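# Illustrative .prom content (epoch values are placeholders; the playbook
# owns the real format):
#   veza_deploy_last_success_timestamp{env="staging"} 1714650000
#   veza_deploy_last_failure_timestamp{env="staging"} 1714550000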
- name: veza_deploy
rules:
- alert: VezaDeployFailed
# Fires when last_failure_timestamp is newer than last_success_timestamp.
# The 5m soak keeps an in-flight retry (a failure immediately followed by
# a successful re-deploy, which writes a newer success timestamp) from
# triggering transiently. The `* 0` arm below keeps the comparison
# per-env even before an env has recorded its first success.
expr: |
max by (env) (veza_deploy_last_failure_timestamp)
> (max by (env) (veza_deploy_last_success_timestamp)
or max by (env) (veza_deploy_last_failure_timestamp) * 0)
for: 5m
labels:
severity: critical
page: "true"
annotations:
summary: "Veza deploy to {{ $labels.env }} failed"
description: |
The most recent deploy attempt to {{ $labels.env }} failed
and HAProxy was reverted to the prior color. The failed
color's containers are kept alive for forensics. Once the operator
has read the journalctl output, clean up via:
gh workflow run cleanup-failed.yml -f env={{ $labels.env }} -f color=<failed_color>
runbook_url: "https://veza.fr/runbooks/deploy-failed"
- alert: VezaStaleDeploy
# Staging cadence is daily-ish; a 7-day silence smells like
# CI is broken or the team is on holiday with prod still
# serving an old SHA. Prod is monthly-ish so 30 days.
# Two separate alerts because the threshold differs.
expr: |
(time() - max(veza_deploy_last_success_timestamp{env="staging"}) by (env)) > (7 * 86400)
for: 1h
labels:
severity: warning
page: "false"
annotations:
summary: "Staging deploy hasn't succeeded in 7+ days"
description: |
Last successful staging deploy was
{{ $value | humanizeDuration }} ago. Pipeline likely broken
(Forgejo runner offline? secret expired?).
- alert: VezaStaleDeployProd
expr: |
(time() - max(veza_deploy_last_success_timestamp{env="prod"}) by (env)) > (30 * 86400)
for: 1h
labels:
severity: warning
page: "false"
annotations:
summary: "Prod deploy hasn't succeeded in 30+ days"
description: |
Last successful prod deploy was {{ $value | humanizeDuration }}
ago. Tag-based release cadence likely stalled.
- alert: VezaFailedColorAlive
# The textfile collector also exposes a custom metric
# `veza_deploy_failed_color_alive{env=...,color=...}` set by
# a small periodic script that scans `incus list` for
# containers in the failed-deploy state. (Stub script lives
# under scripts/observability/scan-failed-colors.sh.)
# Threshold 24h so the operator has at least a working day
# to do post-mortem before the alert fires.
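# Illustrative series (env and color values are placeholders):
#   veza_deploy_failed_color_alive{env="staging",color="blue"} 1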
expr: max(veza_deploy_failed_color_alive) by (env, color) > 0
for: 24h
labels:
severity: warning
page: "false"
annotations:
summary: "Failed deploy color {{ $labels.color }} still alive in {{ $labels.env }}"
description: |
A previously-failed-deploy color has been kept alive for
24+ hours. Either complete forensics + run cleanup-failed,
or the next deploy will recycle it automatically.
# v1.0.9 W5 Day 24: synthetic monitoring (blackbox exporter).
# Each parcours (user journey) is probed every 5 min; the 10m `for:`
# window means an alert fires after 2 consecutive failures (per the
# roadmap acceptance gate). The `parcours` label carries the
# human-readable name from blackbox_targets.yml so dashboards group cleanly.
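# Illustrative series as scraped (instance values elided; labels come
# from the target definitions in blackbox_targets.yml):
#   probe_success{probe_kind="synthetic",parcours="auth_login",instance="..."} 1
#   probe_duration_seconds{probe_kind="synthetic",parcours="auth_login",instance="..."} 0.42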
- name: veza_synthetic
rules:
- alert: SyntheticParcoursDown
# probe_success is 0 when blackbox couldn't complete the probe.
# The metric is emitted per (instance, parcours) so the alert
# fires per-parcours, letting the on-call see exactly which
# journey is broken without grepping logs.
expr: probe_success{probe_kind="synthetic"} == 0
for: 10m
labels:
severity: warning
page: "false"
annotations:
summary: "Synthetic parcours {{ $labels.parcours }} failing for 10m"
description: |
Blackbox exporter has been unable to complete the
{{ $labels.parcours }} parcours against {{ $labels.instance }}
for 10 minutes (≥ 2 consecutive failures). End-user impact
is likely real — investigate the underlying component
BEFORE the related per-component alert fires.
runbook_url: "https://veza.fr/runbooks/synthetic-parcours-down"
- alert: SyntheticAuthLoginDown
# Login is the gate for everything else ; a single 10m blip
# is critical. Pages.
expr: probe_success{parcours="auth_login"} == 0
for: 10m
labels:
severity: critical
page: "true"
annotations:
summary: "Synthetic auth_login down — login surface is broken"
description: |
The auth_login synthetic parcours has failed for 10+ minutes.
Real users cannot log in. Page now.
runbook_url: "https://veza.fr/runbooks/synthetic-parcours-down"
- alert: SyntheticProbeSlow
# Probe latency budget: 5s for plain HTTP parcours, 8s for the heavier
# ones; only the 8s ceiling is alerted on here.
# When real-user latency degrades, blackbox is the canary.
expr: probe_duration_seconds{probe_kind="synthetic"} > 8
for: 15m
labels:
severity: warning
page: "false"
annotations:
summary: "Synthetic parcours {{ $labels.parcours }} > 8s for 15m"
description: |
Probe duration exceeded 8 seconds for the past 15 minutes.
Real users are likely seeing visible latency. Cross-check
the SLO burn-rate alerts; if those are quiet but this
fires, the issue is in the synthetic-only path (DNS,
external dependency).
# v1.0.10 ops item 10 — Business KPI alerts. Infra alerts catch tech
# failures (5xx, latency, queue depth). These catch business failures:
# the platform is technically healthy but users can't sign up, sellers
# don't get paid, revenue trends down. Source counters live in
# internal/monitoring/business_metrics.go; signups + tracks reuse the
# pre-existing per-feature counters in metrics.go.
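# Counters referenced below (label dimensions as used in the expressions):
#   veza_users_registered_total
#   veza_business_logins_total{outcome=...}
#   veza_business_orders_total{status=...}
#   veza_business_revenue_cents_total
#   veza_business_account_deletions_total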
- name: veza_business
rules:
- alert: SignupsDropAlarm
# Compares the last hour's signup rate to the same hour last
# week. A signup-flow break (frontend bug, captcha provider
# outage, email-sender broken so the verify link never lands)
# is invisible on the 5xx dashboard but catastrophic for
# growth. The 50% threshold is a heuristic — tune up if the
# weekly seasonality is noisy. Suppressed on weekends because
# the weekend signup baseline is already noisy enough that
# paging here would be all false positives.
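# (day_of_week() returns 0 for Sunday through 6 for Saturday, evaluated
# at rule-evaluation time, which is what the != 0 / != 6 guard relies on.)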
expr: |
(
sum(rate(veza_users_registered_total[1h]))
/
sum(rate(veza_users_registered_total[1h] offset 7d))
) < 0.5
and
sum(rate(veza_users_registered_total[1h] offset 7d)) > 0.001
and
(day_of_week() != 0 and day_of_week() != 6)
for: 30m
labels:
severity: warning
page: "false"
annotations:
summary: "Signups dropped >50% vs same hour last week"
description: |
Hourly signup rate is below 50% of the same hour last week.
Likely causes: signup-flow regression on web, captcha
provider outage, email-sender broken (verify link never
arrives), age-gate validation too strict. Check the
signup funnel dashboard and the auth.register span on
the OpenTelemetry collector.
runbook_url: "https://veza.fr/runbooks/signups-drop"
- alert: LoginsFailureSpike
# Sudden spike in failed logins is either a real attack
# (credential stuffing) or an internal bug (auth service
# broken, password-hash mismatch after a migration).
# Triggers on >50 failures/min sustained for 10m. The
# account-takeover signal is the success/failure ratio,
# but the absolute rate is a better page-trigger because
# ratio-based alerts flap during low-traffic hours.
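# rate() is per-second, so 50 failures/min is written as 50/60 ≈ 0.83/s.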
expr: |
sum(rate(veza_business_logins_total{outcome=~"failure_.*"}[5m])) > 50/60
for: 10m
labels:
severity: warning
page: "false"
annotations:
summary: "Login failure rate >50/min for 10m"
description: |
Failed logins spiking. Either a credential-stuffing attack
(expected source IPs concentrated, check the rate-limit
audit log) or the auth service is broken (check the
auth.login span errors and the password-verify code path).
runbook_url: "https://veza.fr/runbooks/login-failures-spike"
- alert: PaymentFailuresSpike
# >20% of orders failing in the last 30 minutes. Real
# threshold here is "Hyperswitch is sick" or "our webhook
# signature verification is broken" — both block revenue
# immediately. The 30m window dampens isolated card declines
# which are normal background noise.
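# The volume guard (> 0.01/s over the 30m window, i.e. roughly 18 orders)
# keeps the ratio from flapping on a handful of orders.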
expr: |
(
sum(rate(veza_business_orders_total{status="failed"}[30m]))
/
sum(rate(veza_business_orders_total{status=~"created|completed|failed"}[30m]))
) > 0.2
and
sum(rate(veza_business_orders_total{status=~"created|completed|failed"}[30m])) > 0.01
for: 15m
labels:
severity: critical
page: "true"
annotations:
summary: "Payment failure rate >20% for 15m"
description: |
More than one in five payment attempts failed in the last
30 minutes. Check Hyperswitch dashboard, the
payment.webhook span on the OTEL collector, and verify
the webhook signature secret hasn't been rotated without
updating ours.
runbook_url: "https://veza.fr/runbooks/payment-failures"
- alert: RevenueDropAlarm
# Same shape as SignupsDropAlarm but on revenue cents.
# Catches the case where signups are flat but conversion to
# purchase tanks (broken checkout flow, broken pricing
# display, exclusive-license duplication blocking sales).
expr: |
(
sum(rate(veza_business_revenue_cents_total[1h]))
/
sum(rate(veza_business_revenue_cents_total[1h] offset 7d))
) < 0.4
and
sum(rate(veza_business_revenue_cents_total[1h] offset 7d)) > 0.001
for: 1h
labels:
severity: warning
page: "false"
annotations:
summary: "Revenue dropped >60% vs same hour last week"
description: |
Hourly revenue rate is below 40% of the same hour last
week. Cross-check with PaymentFailuresSpike: if that's
quiet, the issue is upstream of payment (checkout flow,
pricing display, an exclusive-license guard blocking
otherwise-good orders).
runbook_url: "https://veza.fr/runbooks/revenue-drop"
- alert: AccountDeletionEndpointBroken
# Zero deletions over a long window is suspicious if the platform has
# any meaningful churn. The actual signal we want is "the
# endpoint isn't reachable": GDPR requires it to stay
# reachable, and if it's silently broken we're non-compliant.
# The threshold is loose: as long as ONE deletion lands within
# 48h, the alert stays quiet; a full 48h at zero raises this warning.
# Skip when the platform has fewer than ~50 active users
# (early launch), where the rate is genuinely zero.
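# (The guard below, 0.0001/s over 7d, is roughly 8-9 registrations per
# day, about 60 a week: the assumed floor for "real traffic".)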
expr: |
increase(veza_business_account_deletions_total[48h]) == 0
and
sum(rate(veza_users_registered_total[7d])) > 0.0001
for: 6h
labels:
severity: warning
page: "false"
annotations:
summary: "No account deletions in 48h — endpoint may be broken"
description: |
The /users/me DELETE endpoint hasn't recorded a single
deletion in 48 hours despite ongoing signup activity.
Likely the endpoint is broken (GDPR non-compliance risk)
or the metric instrumentation regressed. Test the
endpoint manually and check the RecordAccountDeletion
call site in account_deletion_handler.go.
runbook_url: "https://veza.fr/runbooks/deletion-endpoint-broken"