veza/config/prometheus/alert_rules.yml
senke 199a8efbfe feat(infra): MinIO cross-region replication + DR runbook (v1.0.10 ops item 8)
Closes the "single-region MinIO" gap. The 4-node EC:2 cluster
tolerates 2 simultaneous drive losses, but the region itself
remains a single point of failure: a regional outage (network
partition, DC fire, operator error wiping the cluster) means
losing the data.

New Ansible role minio_replication :
- Wrapper script veza-minio-replicate.sh runs `mc mirror --preserve`
  from the local cluster to a remote S3-compatible target every 6h
  (configurable via OnCalendar).
- Writes textfile-collector metrics on each run (sketched below) :
    veza_minio_replication_last_run_timestamp_seconds
    veza_minio_replication_last_success_timestamp_seconds
    veza_minio_replication_last_duration_seconds
    veza_minio_replication_last_status (1/0)
    veza_minio_replication_target_bytes
- systemd timer with Persistent=true catches up missed runs after
  reboot (this is the disaster-recovery surface, can't afford to
  silently skip ticks).
- Idempotent : `mc alias set` re-applies cleanly, `mc mb
  --ignore-existing` for the target bucket.
- Refuses to run while the vault variables still hold placeholder
  values, so the role can't be applied to prod with bogus
  credentials by accident.
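
The metric-writing tail of the wrapper, as a rough sketch. Variable
names and the .prom filename are illustrative ; $duration and
$target_bytes are computed earlier in the script, and the shipped
version also has to attach the bucket label the alerts group on and
carry the previous success timestamp over on a failed run (both
omitted here) :

    status=$?                    # exit code of `mc mirror --preserve`
    now=$(date +%s)
    tmp=$(mktemp)
    {
      echo "veza_minio_replication_last_run_timestamp_seconds $now"
      [ "$status" -eq 0 ] && \
        echo "veza_minio_replication_last_success_timestamp_seconds $now"
      echo "veza_minio_replication_last_duration_seconds $duration"
      echo "veza_minio_replication_last_status $((status == 0 ? 1 : 0))"
      echo "veza_minio_replication_target_bytes $target_bytes"
    } > "$tmp"
    # atomic rename so node_exporter never scrapes a half-written file
    mv "$tmp" /var/lib/node_exporter/textfile_collector/veza_minio_replication.prom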

Why mc mirror, not MinIO native bucket replication : it works against
any S3-compatible target (Wasabi, Backblaze B2, AWS S3) with just
an access key, whereas MinIO BR/SR (bucket / site replication)
requires the target to be MinIO-managed and bidirectionally
reachable. mc is the lowest common denominator that lets us
decouple from the choice of target operator.
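
Boiled down, the replication path is the one-time alias/bucket setup
plus the mirror itself ; alias and bucket names below are placeholders,
not the values the role templates :

    mc alias set veza-local  https://minio.internal:9000 "$LOCAL_KEY"  "$LOCAL_SECRET"
    mc alias set veza-remote https://s3.example.net      "$REMOTE_KEY" "$REMOTE_SECRET"
    mc mb --ignore-existing veza-remote/veza-backup
    mc mirror --preserve veza-local/veza-prod veza-remote/veza-backup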

Alerts in alert_rules.yml veza_minio_backup group :
- MinioReplicationLastFailed (warning, single failed run)
- MinioReplicationStale (CRITICAL, no success in 12h — past RPO)
- MinioReplicationNeverSucceeded (warning, fresh deploy stuck)
- MinioReplicationTargetShrunk (CRITICAL, > 20% drop in 1h —
  operator-error guard rail)

Runbook docs/runbooks/minio-replication.md covers triage by alert,
common ops tasks (manual sync, pause, credential rotation), and
the manual restore procedure for DR.
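
For orientation only, the restore direction is the same tooling
pointed the other way (aliases as in the sketch above, and only after
the runbook's pre-checks) :

    mc mirror --preserve veza-remote/veza-backup veza-local/veza-prod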

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-05 00:04:25 +02:00

groups:
- name: veza_critical
rules:
- alert: ServiceDown
expr: up == 0
for: 30s
labels:
severity: critical
annotations:
summary: "Service {{ $labels.job }} is down"
description: "{{ $labels.instance }} has been down for more than 30 seconds."
- alert: HighErrorRate
expr: rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m]) > 0.05
for: 5m
labels:
severity: warning
annotations:
summary: "High error rate on {{ $labels.job }}"
description: "Error rate is above 5% for the last 5 minutes."
- alert: HighLatencyP99
expr: histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) > 2
for: 5m
labels:
severity: warning
annotations:
summary: "High P99 latency on {{ $labels.job }}"
description: "P99 latency is above 2 seconds for the last 5 minutes."
- alert: RedisUnreachable
expr: redis_up == 0
for: 30s
labels:
severity: critical
annotations:
summary: "Redis is unreachable"
description: "Redis has been unreachable for more than 30 seconds."
# v1.0.9 Day 8: backup integrity. The dr-drill.sh script writes
# textfile-collector metrics on every run. Two failure modes are
# caught:
# 1. last drill reported a failure (success=0)
# 2. drill hasn't run in 8+ days (timer broke, runner offline,
# script crashed before write_metric)
# Both are pages because a backup we haven't proved restorable is
# technical debt waiting for a disaster to bite — finding out at
# restore-time is too late.
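# For orientation, a drill run leaves a textfile snippet shaped roughly
# like this (values and exact label set are illustrative ; dr-drill.sh
# owns the real format) :
#   veza_backup_drill_last_success{stanza="main"} 1
#   veza_backup_drill_last_run_timestamp_seconds{stanza="main"} 1.746e+09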
- name: veza_backup
rules:
- alert: BackupRestoreDrillFailed
expr: veza_backup_drill_last_success == 0
for: 5m
labels:
severity: critical
annotations:
summary: "pgBackRest dr-drill last run failed (stanza={{ $labels.stanza }})"
description: |
The most recent dr-drill.sh execution reported failure
(reason={{ $labels.reason }}). Backups exist but a
restore from them did NOT round-trip the smoke query.
Investigate via: journalctl -u pgbackrest-drill.service -n 200
and consider running the drill manually with --keep to
inspect the restored container before teardown.
runbook_url: "https://veza.fr/runbooks/backup-restore-drill-failed"
- alert: BackupRestoreDrillStale
expr: time() - veza_backup_drill_last_run_timestamp_seconds > 691200 # 8 days
for: 1h
labels:
severity: warning
annotations:
summary: "pgBackRest dr-drill hasn't run in 8+ days"
description: |
The dr-drill timer fires weekly (Sun 04:00 UTC). A run
older than 8 days means the timer is broken, the runner
is offline, or the script crashed before writing its
metrics file. Verify with:
systemctl status pgbackrest-drill.timer
journalctl -u pgbackrest-drill.service -n 200
runbook_url: "https://veza.fr/runbooks/backup-restore-drill-stale"
# v1.0.9 W3 Day 12: distributed MinIO health. EC:2 tolerates 2-drive
# loss before data becomes unavailable, so the alert fires the moment
# one drive is offline — gives us margin to react before the second
# failure exhausts redundancy.
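# Quick manual cross-check from the ops host (assuming an mc alias for
# the local cluster is configured, e.g. veza-local) :
#   mc admin info veza-local    # per-server drive online/offline counts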
- name: veza_minio
rules:
- alert: MinIODriveOffline
# minio_node_drive_online_total counts the drives each server sees as
# online ; it drops below minio_node_drive_total when a drive is offline.
# Both metrics are exposed by every node (set MINIO_PROMETHEUS_AUTH_TYPE=public)
# so a single missing scrape doesn't trip the alert.
expr: min(minio_node_drive_online_total) by (server) < min(minio_node_drive_total) by (server)
for: 2m
labels:
severity: warning
page: "false"
annotations:
summary: "MinIO drive offline on {{ $labels.server }}"
description: |
One or more drives report offline on {{ $labels.server }}. EC:2
still serves reads, but a second drive failure would cause a
data-unavailability event. Investigate within the hour.
ssh {{ $labels.server }} sudo journalctl -u minio -n 200
runbook_url: "https://veza.fr/runbooks/minio-drive-offline"
- alert: MinIONodesUnreachable
# ≥ 2 nodes down on a 4-node EC:2 cluster = redundancy fully consumed.
# Pages the on-call. (The threshold sits AT the 2-drive tolerance, not
# past it : we want the page while the data is still readable, because
# one more failure means it isn't.)
expr: count(up{job="minio"} == 0) >= 2
for: 1m
labels:
severity: critical
page: "true"
annotations:
summary: "Two or more MinIO nodes unreachable"
description: |
EC:2 tolerates 2-drive loss. With 1 drive per node, ≥ 2 nodes
unreachable means we are at-or-past the redundancy ceiling.
Any further failure causes data unavailability. Page now.
runbook_url: "https://veza.fr/runbooks/minio-nodes-unreachable"
# W5+ : Forgejo+Ansible+Incus deploy pipeline. The deploy_app.yml
# playbook writes a textfile-collector .prom file under
# /var/lib/node_exporter/textfile_collector/veza_deploy.prom on every
# deploy attempt. node_exporter scrapes it and exposes the metrics
# via the standard /metrics endpoint, no Pushgateway needed.
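# For orientation, an attempt leaves something like this in the .prom
# file (illustrative values ; deploy_app.yml owns the real label set) :
#   veza_deploy_last_success_timestamp{env="staging"} 1.746e+09
#   veza_deploy_last_failure_timestamp{env="staging"} 1.745e+09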
- name: veza_deploy
rules:
- alert: VezaDeployFailed
# last_failure_timestamp newer than last_success_timestamp.
# 5m soak so a deploy in progress (writes failure THEN switches
# back, which writes success on the next successful deploy)
# doesn't transient-trigger.
expr: |
max(veza_deploy_last_failure_timestamp) by (env) >
max(veza_deploy_last_success_timestamp or vector(0)) by (env)
for: 5m
labels:
severity: critical
page: "true"
annotations:
summary: "Veza deploy to {{ $labels.env }} failed"
description: |
The most recent deploy attempt to {{ $labels.env }} failed
and HAProxy was reverted to the prior color. The failed
color's containers are kept alive for forensics. Inspect:
gh workflow run cleanup-failed.yml -f env={{ $labels.env }} -f color=<failed_color>
once the operator has read the journalctl output.
runbook_url: "https://veza.fr/runbooks/deploy-failed"
- alert: VezaStaleDeploy
# Staging cadence is daily-ish; a 7-day silence smells like
# CI is broken or the team is on holiday with prod still
# serving an old SHA. Prod is monthly-ish so 30 days.
# Two separate alerts because the threshold differs.
expr: |
(time() - max(veza_deploy_last_success_timestamp{env="staging"}) by (env)) > (7 * 86400)
for: 1h
labels:
severity: warning
page: "false"
annotations:
summary: "Staging deploy hasn't succeeded in 7+ days"
description: |
Last successful staging deploy was
{{ $value | humanizeDuration }} ago. Pipeline likely broken
(Forgejo runner offline ? secret expired ?).
- alert: VezaStaleDeployProd
expr: |
(time() - max(veza_deploy_last_success_timestamp{env="prod"}) by (env)) > (30 * 86400)
for: 1h
labels:
severity: warning
page: "false"
annotations:
summary: "Prod deploy hasn't succeeded in 30+ days"
description: |
Last successful prod deploy was {{ $value | humanizeDuration }}
ago. Tag-based release cadence likely stalled.
- alert: VezaFailedColorAlive
# The textfile collector also exposes a custom metric
# `veza_deploy_failed_color_alive{env=...,color=...}` set by
# a small periodic script that scans `incus list` for
# containers in the failed-deploy state. (Stub script lives
# under scripts/observability/scan-failed-colors.sh.)
# Threshold 24h so the operator has at least a working day
# to do post-mortem before the alert fires.
expr: max(veza_deploy_failed_color_alive) by (env, color) > 0
for: 24h
labels:
severity: warning
page: "false"
annotations:
summary: "Failed deploy color {{ $labels.color }} still alive in {{ $labels.env }}"
description: |
A previously-failed-deploy color has been kept alive for
24+ hours. Either complete forensics + run cleanup-failed,
or the next deploy will recycle it automatically.
# v1.0.9 W5 Day 24 : synthetic monitoring (blackbox exporter).
# Each parcours is probed every 5 min ; the 10m `for:` window means
# an alert fires after 2 consecutive failures (per the roadmap
# acceptance gate). `parcours` label carries the human-readable
# name from blackbox_targets.yml so dashboards group cleanly.
- name: veza_synthetic
rules:
- alert: SyntheticParcoursDown
# probe_success is 0 when blackbox couldn't complete the probe.
# The metric is emitted per (instance, parcours) so the alert
# fires per-parcours, letting the on-call see exactly which
# journey is broken without grepping logs.
expr: probe_success{probe_kind="synthetic"} == 0
for: 10m
labels:
severity: warning
page: "false"
annotations:
summary: "Synthetic parcours {{ $labels.parcours }} failing for 10m"
description: |
Blackbox exporter has been unable to complete the
{{ $labels.parcours }} parcours against {{ $labels.instance }}
for 10 minutes (≥ 2 consecutive failures). End-user impact
is likely real — investigate the underlying component
BEFORE the related per-component alert fires.
runbook_url: "https://veza.fr/runbooks/synthetic-parcours-down"
- alert: SyntheticAuthLoginDown
# Login is the gate for everything else ; a single 10m blip
# is critical. Pages.
expr: probe_success{parcours="auth_login"} == 0
for: 10m
labels:
severity: critical
page: "true"
annotations:
summary: "Synthetic auth_login down — login surface is broken"
description: |
The auth_login synthetic parcours has failed for 10+ minutes.
Real users cannot log in. Page now.
runbook_url: "https://veza.fr/runbooks/synthetic-parcours-down"
- alert: SyntheticProbeSlow
# Probe latency budget : 5s for HTTP, 8s for the heavier ones.
# When real-user latency degrades, blackbox is the canary.
expr: probe_duration_seconds{probe_kind="synthetic"} > 8
for: 15m
labels:
severity: warning
page: "false"
annotations:
summary: "Synthetic parcours {{ $labels.parcours }} > 8s for 15m"
description: |
Probe duration exceeded 8 seconds for the past 15 minutes.
Real users are likely seeing visible latency. Cross-check
the SLO burn-rate alerts ; if those are quiet but this
fires, the issue is in the synthetic-only path (DNS,
external dependency).
# v1.0.10 ops item 10 — Business KPI alerts. Infra alerts catch tech
# failures (5xx, latency, queue depth). These catch business failures :
# the platform is technically healthy but users can't sign up, sellers
# don't get paid, revenue trends down. Source counters live in
# internal/monitoring/business_metrics.go ; signups + tracks reuse the
# pre-existing per-feature counters in metrics.go.
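# Counters this group reads, for orientation (label sets as used in the
# expressions below) :
#   veza_users_registered_total
#   veza_business_logins_total{outcome=...}
#   veza_business_orders_total{status=...}
#   veza_business_revenue_cents_total
#   veza_business_account_deletions_total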
- name: veza_business
rules:
- alert: SignupsDropAlarm
# Compares the last hour's signup rate to the same hour last
# week. A signup-flow break (frontend bug, captcha provider
# outage, email-sender broken so the verify link never lands)
# is invisible on the 5xx dashboard but catastrophic for
# growth. The 50% threshold is a heuristic — tune up if the
# weekly seasonality is noisy. Suppressed on weekends because
# the weekend signup baseline is already noisy enough that
# paging here would be all false positives.
expr: |
(
sum(rate(veza_users_registered_total[1h]))
/
sum(rate(veza_users_registered_total[1h] offset 7d))
) < 0.5
and
sum(rate(veza_users_registered_total[1h] offset 7d)) > 0.001
and
(day_of_week() != 0 and day_of_week() != 6)
for: 30m
labels:
severity: warning
page: "false"
annotations:
summary: "Signups dropped >50% vs same hour last week"
description: |
Hourly signup rate is below 50% of the same hour last week.
Likely causes : signup-flow regression on web, captcha
provider outage, email-sender broken (verify link never
arrives), age-gate validation too strict. Check the
signup funnel dashboard and the auth.register span on
the OpenTelemetry collector.
runbook_url: "https://veza.fr/runbooks/signups-drop"
- alert: LoginsFailureSpike
# Sudden spike in failed logins is either a real attack
# (credential stuffing) or an internal bug (auth service
# broken, password-hash mismatch after a migration).
# Triggers on >50 failures/min sustained for 10m. The
# account-takeover signal is the success/failure ratio,
# but the absolute rate is a better page-trigger because
# ratio-based alerts flap during low-traffic hours.
expr: |
sum(rate(veza_business_logins_total{outcome=~"failure_.*"}[5m])) > 50/60
for: 10m
labels:
severity: warning
page: "false"
annotations:
summary: "Login failure rate >50/min for 10m"
description: |
Failed logins spiking. Either a credential-stuffing attack
(expected source IPs concentrated, check the rate-limit
audit log) or the auth service is broken (check the
auth.login span errors and the password-verify code path).
runbook_url: "https://veza.fr/runbooks/login-failures-spike"
- alert: PaymentFailuresSpike
# >20% of orders failing in the last 30 minutes. Real
# threshold here is "Hyperswitch is sick" or "our webhook
# signature verification is broken" — both block revenue
# immediately. The 30m window dampens isolated card declines
# which are normal background noise.
expr: |
(
sum(rate(veza_business_orders_total{status="failed"}[30m]))
/
sum(rate(veza_business_orders_total{status=~"created|completed|failed"}[30m]))
) > 0.2
and
sum(rate(veza_business_orders_total{status=~"created|completed|failed"}[30m])) > 0.01
for: 15m
labels:
severity: critical
page: "true"
annotations:
summary: "Payment failure rate >20% for 15m"
description: |
More than one in five payment attempts failed in the last
30 minutes. Check Hyperswitch dashboard, the
payment.webhook span on the OTEL collector, and verify
the webhook signature secret hasn't been rotated without
updating ours.
runbook_url: "https://veza.fr/runbooks/payment-failures"
- alert: RevenueDropAlarm
# Same shape as SignupsDropAlarm but on revenue cents.
# Catches the case where signups are flat but conversion to
# purchase tanks (broken checkout flow, broken pricing
# display, exclusive-license duplication blocking sales).
expr: |
(
sum(rate(veza_business_revenue_cents_total[1h]))
/
sum(rate(veza_business_revenue_cents_total[1h] offset 7d))
) < 0.4
and
sum(rate(veza_business_revenue_cents_total[1h] offset 7d)) > 0.001
for: 1h
labels:
severity: warning
page: "false"
annotations:
summary: "Revenue dropped >60% vs same hour last week"
description: |
Hourly revenue rate is below 40% of the same hour last
week. Cross-check with PaymentFailuresSpike : if that's
quiet, the issue is upstream of payment (checkout flow,
pricing display, an exclusive-license guard blocking
otherwise-good orders).
runbook_url: "https://veza.fr/runbooks/revenue-drop"
- alert: AccountDeletionEndpointBroken
# No deletions in 24h is suspicious if the platform has
# any meaningful churn. The actual signal we want is "the
# endpoint isn't reachable" — RGPD requires it to stay
# reachable, and if it's silently broken we're non-compliant.
# The threshold is loose : as long as ONE deletion lands
# in 24h, we don't page. If it stays at zero for 48h, page.
# Skip when the platform has fewer than ~50 active users
# (early launch) — the rate is genuinely zero.
expr: |
increase(veza_business_account_deletions_total[48h]) == 0
and
sum(rate(veza_users_registered_total[7d])) > 0.0001
for: 6h
labels:
severity: warning
page: "false"
annotations:
summary: "No account deletions in 48h — endpoint may be broken"
description: |
The /users/me DELETE endpoint hasn't recorded a single
deletion in 48 hours despite ongoing signup activity.
Likely the endpoint is broken (RGPD non-compliance risk)
or the metric instrumentation regressed. Test the
endpoint manually and check the RecordAccountDeletion
call site in account_deletion_handler.go.
runbook_url: "https://veza.fr/runbooks/deletion-endpoint-broken"
# v1.0.10 ops item 9 — Real User Monitoring alerts. Synthetic probes
# already alert on server-side latency ; these alerts catch the
# "users in the wild are seeing it slow even though our infra
# dashboards are green" gap (slow CDN edges, third-party scripts,
# bloated bundle on a route, mobile-CPU regressions). The alerts
# fire when the p75 user experience crosses Google's published
# Web Vitals thresholds for a sustained window.
- name: veza_rum
rules:
- alert: WebVitalsLCPP75Poor
# p75 LCP > 4s for 30m on the same route+device. 4s is the
# Google "poor" threshold ; we page only on the worst
# category because "needs improvement" (2.5s–4s) is a
# backlog signal, not an incident.
expr: |
histogram_quantile(0.75, sum(rate(veza_web_vitals_lcp_seconds_bucket[15m])) by (route, device, le)) > 4.0
and
sum(rate(veza_web_vitals_lcp_seconds_count[15m])) by (route, device) > 0.05
for: 30m
labels:
severity: warning
page: "false"
annotations:
summary: "LCP p75 > 4s on {{ $labels.route }}/{{ $labels.device }} for 30m"
description: |
Real users on route={{ $labels.route }} device={{ $labels.device }}
are seeing Largest Contentful Paint above the Google
"poor" threshold. Usually : a heavy hero image / late-
loading font / large bundle on this route. Check the
bundle-size CI artifact, the CDN cache hit rate for
this route's HTML, and recent third-party script
additions.
runbook_url: "https://veza.fr/runbooks/web-vitals-lcp"
- alert: WebVitalsCLSP75Poor
# p75 CLS > 0.25 for 30m on the same route+device. Layout
# shift > 0.25 is the "poor" category — usually caused by
# late-loading images without dimensions, ad slots inserted
# post-paint, or font swaps.
expr: |
histogram_quantile(0.75, sum(rate(veza_web_vitals_cls_bucket[15m])) by (route, device, le)) > 0.25
and
sum(rate(veza_web_vitals_cls_count[15m])) by (route, device) > 0.05
for: 30m
labels:
severity: warning
page: "false"
annotations:
summary: "CLS p75 > 0.25 on {{ $labels.route }}/{{ $labels.device }} for 30m"
description: |
Real users are seeing Cumulative Layout Shift above the
"poor" threshold on route={{ $labels.route }}. Usually :
an image / iframe / ad without explicit width/height,
a font swap shifting paragraphs down, or content
injected post-paint. Inspect the route's React tree
for new dynamic elements.
runbook_url: "https://veza.fr/runbooks/web-vitals-cls"
- alert: WebVitalsINPP75Poor
# p75 INP > 0.5s for 30m. INP measures interaction
# responsiveness — > 500ms is genuinely sluggish UI.
# Often caused by a heavy event handler or main-thread
# blocking from a third-party script.
expr: |
histogram_quantile(0.75, sum(rate(veza_web_vitals_inp_seconds_bucket[15m])) by (route, device, le)) > 0.5
and
sum(rate(veza_web_vitals_inp_seconds_count[15m])) by (route, device) > 0.05
for: 30m
labels:
severity: warning
page: "false"
annotations:
summary: "INP p75 > 500ms on {{ $labels.route }}/{{ $labels.device }} for 30m"
description: |
Real users see > 500ms response after interaction. Look
at recent commits to the React tree for this route
(heavy onClick handlers, synchronous state updates that
re-render large subtrees), and the third-party scripts
loaded on this route (analytics, chat widgets).
runbook_url: "https://veza.fr/runbooks/web-vitals-inp"
- alert: WebVitalsBeaconsStopped
# No beacons in 30m on a window where we expect them.
# Usually means : frontend instrumentation broke, the
# endpoint is rejecting (CORS / 4xx), or the CDN is
# blocking the POST. Compares to the same-time-of-day
# baseline 24h ago to avoid pages during low-traffic
# nights.
expr: |
sum(rate(veza_web_vitals_beacons_total[15m])) == 0
and
sum(rate(veza_web_vitals_beacons_total[15m] offset 24h)) > 0.05
for: 30m
labels:
severity: warning
page: "false"
annotations:
summary: "RUM beacons stopped flowing for 30m"
description: |
No Web Vitals beacons received in 30 minutes despite
yesterday's same-hour baseline showing traffic. Likely :
frontend webVitals.ts module crashed, the
/api/v1/observability/web-vitals endpoint is rejecting
(check WebVitalsRejectedTotal), or a CDN / WAF rule
is blocking the POST.
runbook_url: "https://veza.fr/runbooks/web-vitals-beacons-stopped"
# v1.0.10 ops item 11 — DB pool monitoring + N+1 detection alerts.
# The pool gauges are fed by StartPoolStatsExporter at 15s ticks.
# The query-count histogram is fed by the N1QueryCounter
# middleware on every HTTP request. Together these alerts catch :
# - Pool exhaustion under load (more requests want a connection
# than the pool has, requests pile up waiting)
# - N+1 regressions on a specific route (the ListXxx handler that
# used to do 1 query now does 200)
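# Series involved, for orientation (route/table values are illustrative ;
# the exporter and middleware own the real label sets) :
#   veza_db_connections{state="in_use"}                      pool gauge, 15s ticks
#   veza_db_request_query_count_bucket{route="...",le="50"}  per-request query count
#   veza_db_n1_suspicions_total{route="..."}                  requests over the N+1 threshold
#   veza_db_slow_queries_total{operation="...",table="..."}   queries over the 1s threshold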
- name: veza_db_pool_n1
rules:
- alert: PoolExhaustionImminent
# in_use ≥ 90% of MaxOpenConns sustained for 5m. The 90%
# is a head-room buffer ; once we hit 100% the next request
# blocks waiting for a free connection and request latency
# spikes. Acting at 90% gives ops a chance to scale out
# before the visible incident.
# MaxOpenConns is configurable via DB_MAX_OPEN_CONNS ; the
# default is 50 in pool.go. The expression hardcodes 50 in
# the threshold because Prometheus can't read app env vars
# — bump this if DB_MAX_OPEN_CONNS is raised.
expr: |
veza_db_connections{state="in_use"} >= 45
for: 5m
labels:
severity: warning
page: "false"
annotations:
summary: "DB pool 90%+ saturated for 5m"
description: |
Pool in_use ≥ 45 / 50. Either the DB is slow (queries
taking longer to release connections) or request volume
spiked. Check the slow-query log + the request-rate
dashboard, and scale the pool via DB_MAX_OPEN_CONNS
if this is a sustained baseline shift.
runbook_url: "https://veza.fr/runbooks/pool-exhaustion"
- alert: PoolStatsExporterStuck
# Pool gauges haven't moved in 5m. Either the exporter
# goroutine died or sql.DB.Stats() is consistently
# returning the same value (a real but rare scenario,
# usually a sign of a hung connection somewhere).
expr: |
changes(veza_db_connections{state="open"}[5m]) == 0
and
rate(http_requests_total[5m]) > 1
for: 10m
labels:
severity: warning
page: "false"
annotations:
summary: "DB pool gauges frozen for 10m despite traffic"
description: |
The pool-stats exporter goroutine appears stuck (no
change in open-connection count for 10m even though
HTTP traffic is flowing). Restart the API process or
inspect the journal for "pool stats sample failed"
warnings.
runbook_url: "https://veza.fr/runbooks/pool-exhaustion"
- alert: N1QuerySpike
# > 3% of recent requests on a route exceed the N+1
# threshold. The histogram bucket boundaries are 1, 5,
# 10, 20, 50, 100, 200, 500, 1000 — anything in the
# >100 range is firmly N+1 territory regardless of route.
# The 3% noise floor avoids paging on the occasional
# admin batch operation that legitimately sweeps tables.
expr: |
(
sum(rate(veza_db_n1_suspicions_total[10m])) by (route)
/
sum(rate(veza_db_request_query_count_count[10m])) by (route)
) > 0.03
and
sum(rate(veza_db_request_query_count_count[10m])) by (route) > 0.05
for: 15m
labels:
severity: warning
page: "false"
annotations:
summary: "N+1 suspicion rate >3% on {{ $labels.route }} for 15m"
description: |
More than 3% of recent requests to {{ $labels.route }}
ran more queries than the N+1 threshold (default 50).
Check the handler for missing Preload calls or
loop-of-singles patterns. The slow-query log will
show the per-row queries.
runbook_url: "https://veza.fr/runbooks/n-plus-one"
- alert: SlowQuerySustained
# > 2 slow queries / minute sustained for 15m on the
# same operation+table. Slow query is anything over the
# PerformanceMonitor threshold (default 1s). One-off
# slow queries are noise ; sustained slow queries on the
# same table point at a missing index or a regressed plan.
expr: |
sum(rate(veza_db_slow_queries_total[10m])) by (operation, table) > 0.033
for: 15m
labels:
severity: warning
page: "false"
annotations:
summary: "Slow {{ $labels.operation }} on {{ $labels.table }} >2/min for 15m"
description: |
Sustained slow queries on table={{ $labels.table }}
operation={{ $labels.operation }}. Likely : missing
index, plan regression after a recent schema change,
or a suddenly-large table hitting a sequential scan.
Check pg_stat_statements + the slow-query log.
runbook_url: "https://veza.fr/runbooks/slow-query-sustained"
# v1.0.10 ops item 8 — MinIO cross-region replication alerts.
# The mc mirror script writes textfile metrics on every run ;
# node_exporter exposes them as veza_minio_replication_*. Two
# failure modes are caught :
# 1. Last run failed (status=0) — actionable now.
# 2. No successful run in > 12h — the timer broke, the host
# is offline, or the wrapper died before writing. Either
# way the backup we have is older than the RPO (6h cadence
# doubled = 12h).
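# The wrapper's textfile output looks roughly like this (values and the
# bucket label value are illustrative ; the script owns the real format) :
#   veza_minio_replication_last_run_timestamp_seconds{bucket="veza-prod"} 1.7465e+09
#   veza_minio_replication_last_success_timestamp_seconds{bucket="veza-prod"} 1.7465e+09
#   veza_minio_replication_last_duration_seconds{bucket="veza-prod"} 42
#   veza_minio_replication_last_status{bucket="veza-prod"} 1
#   veza_minio_replication_target_bytes{bucket="veza-prod"} 5.2e+10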
- name: veza_minio_backup
rules:
- alert: MinioReplicationLastFailed
# The last replication attempt returned non-zero. Warning only,
# no page : a single failed run is recoverable (the next 6h tick
# retries), but the operator should see it surfaced rather than
# discover it at restore time.
expr: |
veza_minio_replication_last_status == 0
for: 5m
labels:
severity: warning
page: "false"
annotations:
summary: "Last MinIO replication run for {{ $labels.bucket }} failed"
description: |
veza-minio-replicate.service last completed with a
non-zero exit. Check `journalctl -u veza-minio-replicate`
for the mc-mirror error (network ? permissions ?
target full ?). The next timer tick will retry — if
this fires repeatedly the issue is configuration,
not transient.
runbook_url: "https://veza.fr/runbooks/minio-replication"
- alert: MinioReplicationStale
# > 12h since the last *successful* replication. Pages at
# critical because once we cross the RPO the backup we hold
# is no longer fit for purpose — restoring from it would
# lose more user data than the documented SLA permits.
# The underlying timer fires every 6h, so any value above
# ~14h indicates either the timer is broken or the script
# has been failing for at least 2 ticks.
expr: |
(time() - veza_minio_replication_last_success_timestamp_seconds) > 12 * 60 * 60
and
veza_minio_replication_last_success_timestamp_seconds > 0
for: 30m
labels:
severity: critical
page: "true"
annotations:
summary: "MinIO replication for {{ $labels.bucket }} hasn't succeeded in 12+ hours"
description: |
No successful replication in 12+ hours. RPO is 6h, so
we're past it. Likely : timer disabled / unit failed
to load, host offline, mc remote alias broken, target
cluster down. Check `systemctl status veza-minio-replicate.timer`
and the wrapper script's last journal output.
runbook_url: "https://veza.fr/runbooks/minio-replication"
- alert: MinioReplicationNeverSucceeded
# Never-succeeded gauge stuck at 0 for 24h+ on a bucket
# that the prod cluster IS writing to. Different from
# Stale above : Stale assumes there was at least one
# success at some point ; this catches the freshly-deployed
# role that hasn't gone through a single green run.
expr: |
veza_minio_replication_last_success_timestamp_seconds == 0
and
veza_minio_replication_last_run_timestamp_seconds > 0
for: 24h
labels:
severity: warning
page: "false"
annotations:
summary: "MinIO replication for {{ $labels.bucket }} has never succeeded"
description: |
The replication script has run at least once but
never returned success. Almost certainly a config
error : wrong remote endpoint, bad credentials, or
a target bucket that doesn't exist and can't be
created (permission). Inspect the most recent
journalctl output for veza-minio-replicate and fix
before the first DR scenario lands.
runbook_url: "https://veza.fr/runbooks/minio-replication"
- alert: MinioReplicationTargetShrunk
# The target bucket size dropped > 20% in 1h. A legitimate
# cause is a lifecycle policy aging-out old data, but a
# 20% drop in an hour is implausible from lifecycle alone.
# More commonly : someone fat-fingered a bucket policy or
# ran `mc rb --force` against the wrong alias.
expr: |
(
(veza_minio_replication_target_bytes offset 1h)
-
veza_minio_replication_target_bytes
)
/
(veza_minio_replication_target_bytes offset 1h)
> 0.2
and
(veza_minio_replication_target_bytes offset 1h) > 1e9
for: 5m
labels:
severity: critical
page: "true"
annotations:
summary: "MinIO replication target bucket {{ $labels.bucket }} shrunk >20% in 1h"
description: |
The replication target bucket lost more than 20% of
its size in the past hour. Almost always operator
error (wrong bucket targeted by mc rb, lifecycle
misconfiguration, or `--remove` flag accidentally
enabled when the source had a mass deletion). Pause
the timer immediately :
systemctl stop veza-minio-replicate.timer
and investigate before the next tick propagates the
damage further.
runbook_url: "https://veza.fr/runbooks/minio-replication"