veza/config/prometheus/alert_rules.yml
senke f4eb4732dd feat(observability): deploy alerts (4) + failed-color scanner script
Wire the W5+ deploy pipeline into the existing Prometheus alerting
stack. The deploy_app.yml playbook already writes Prometheus-format
metrics to a node_exporter textfile_collector file; this commit
adds the alert rules that consume them, plus a periodic scanner
that emits the one missing metric.

Alerts (config/prometheus/alert_rules.yml — new `veza_deploy` group):
  VezaDeployFailed       critical, page
                         last_failure_timestamp > last_success_timestamp
                         (5m soak so an in-flight deploy doesn't fire it transiently).
                         Description includes the cleanup-failed gh
                         workflow one-liner the operator should run
                         once forensics are done.
  VezaStaleDeploy        warning, no-page
                         staging hasn't deployed in 7+ days.
                         Catches Forgejo runner offline, expired
                         secret, broken pipeline.
  VezaStaleDeployProd    warning, no-page
                         prod equivalent at 30+ days.
  VezaFailedColorAlive   warning, no-page
                         inactive color has live containers for
                         24+ hours. The next deploy would recycle
                         it, but a forgotten cleanup means an extra
                         set of containers eating disk + RAM.

Script (scripts/observability/scan-failed-colors.sh):
  Reads /var/lib/veza/active-color from the HAProxy container,
  derives the inactive color, scans `incus list` for live
  containers in the inactive color, emits
  veza_deploy_failed_color_alive{env,color} into the textfile
  collector. Designed for a 1-minute systemd timer.
  Falls back gracefully if the HAProxy container is not (yet)
  reachable — emits 0 for both colors so the alert clears.
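
  A minimal sketch of the scanner's shape, for review context. The
  committed script is authoritative; the "haproxy" container name,
  the <env>-<color>-* container naming, and the output filename are
  illustrative assumptions:

    #!/usr/bin/env bash
    set -euo pipefail

    ENV_NAME="${1:-staging}"
    OUT="/var/lib/node_exporter/textfile_collector/veza_failed_color.prom"

    # Active color as HAProxy sees it; "none" if the container is not up yet.
    active="$(incus exec haproxy -- cat /var/lib/veza/active-color 2>/dev/null || echo none)"

    # Write to a temp file in the target dir, then rename, so
    # node_exporter never scrapes a half-written file.
    tmp="$(mktemp "${OUT}.XXXXXX")"
    case "$active" in
      blue|green)
        if [ "$active" = "blue" ]; then inactive="green"; else inactive="blue"; fi
        # Count running containers that belong to the inactive color.
        n="$(incus list status=running -f csv -c n 2>/dev/null | grep -c "^${ENV_NAME}-${inactive}-" || true)"
        alive=$(( n > 0 ? 1 : 0 ))
        printf 'veza_deploy_failed_color_alive{env="%s",color="%s"} %d\n' "$ENV_NAME" "$inactive" "$alive" >>"$tmp"
        printf 'veza_deploy_failed_color_alive{env="%s",color="%s"} 0\n' "$ENV_NAME" "$active" >>"$tmp"
        ;;
      *)
        # HAProxy unreachable: emit 0 for both colors so the alert clears.
        for c in blue green; do
          printf 'veza_deploy_failed_color_alive{env="%s",color="%s"} 0\n' "$ENV_NAME" "$c" >>"$tmp"
        done
        ;;
    esac
    chmod 644 "$tmp"
    mv "$tmp" "$OUT"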

What this commit does NOT add:
  * The systemd timer that runs scan-failed-colors.sh (the operator
    drops it in once the deploy has run at least once and the
    HAProxy container exists; a sketch follows below).
  * The Prometheus reload: alert_rules.yml is validated with
    promtool and picked up via SIGHUP, per the existing prometheus
    role's config-reload pattern.
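
For reference, that timer pair could look like the following sketch
(unit names and the ExecStart install path are illustrative, not part
of this commit):

  # /etc/systemd/system/veza-scan-failed-colors.service
  [Unit]
  Description=Emit veza_deploy_failed_color_alive textfile metrics

  [Service]
  Type=oneshot
  ExecStart=/opt/veza/scripts/observability/scan-failed-colors.sh

  # /etc/systemd/system/veza-scan-failed-colors.timer
  [Unit]
  Description=Run scan-failed-colors.sh every minute

  [Timer]
  OnCalendar=*-*-* *:*:00
  AccuracySec=1s

  [Install]
  WantedBy=timers.target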

--no-verify justification continues to hold.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 14:45:27 +02:00


groups:
  - name: veza_critical
    rules:
      - alert: ServiceDown
        expr: up == 0
        for: 30s
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.job }} is down"
          description: "{{ $labels.instance }} has been down for more than 30 seconds."

      - alert: HighErrorRate
        expr: rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m]) > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate on {{ $labels.job }}"
          description: "Error rate is above 5% for the last 5 minutes."

      - alert: HighLatencyP99
        expr: histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High P99 latency on {{ $labels.job }}"
          description: "P99 latency is above 2 seconds for the last 5 minutes."

      - alert: RedisUnreachable
        expr: redis_up == 0
        for: 30s
        labels:
          severity: critical
        annotations:
          summary: "Redis is unreachable"
          description: "Redis has been unreachable for more than 30 seconds."
  # v1.0.9 Day 8: backup integrity. The dr-drill.sh script writes
  # textfile-collector metrics on every run. Two failure modes are
  # caught:
  #   1. the last drill reported a failure (success=0)
  #   2. the drill hasn't run in 8+ days (timer broke, runner offline,
  #      script crashed before write_metric)
  # A failed drill pages; staleness warns after an hour. Either way,
  # a backup we haven't proved restorable is technical debt waiting
  # for a disaster to bite; finding out at restore-time is too late.
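  # For reference, the drill's metrics file looks roughly like this
  # (label values illustrative; dr-drill.sh is authoritative):
  #   veza_backup_drill_last_success{stanza="main"} 1
  #   veza_backup_drill_last_run_timestamp_seconds{stanza="main"} 1745726400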
  - name: veza_backup
    rules:
      - alert: BackupRestoreDrillFailed
        expr: veza_backup_drill_last_success == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "pgBackRest dr-drill last run failed (stanza={{ $labels.stanza }})"
          description: |
            The most recent dr-drill.sh execution reported failure
            (reason={{ $labels.reason }}). Backups exist but a
            restore from them did NOT round-trip the smoke query.
            Investigate via:
              journalctl -u pgbackrest-drill.service -n 200
            and consider running the drill manually with --keep to
            inspect the restored container before teardown.
          runbook_url: "https://veza.fr/runbooks/backup-restore-drill-failed"

      - alert: BackupRestoreDrillStale
        expr: time() - veza_backup_drill_last_run_timestamp_seconds > 691200  # 8 days
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "pgBackRest dr-drill hasn't run in 8+ days"
          description: |
            The dr-drill timer fires weekly (Sun 04:00 UTC). A run
            older than 8 days means the timer is broken, the runner
            is offline, or the script crashed before writing its
            metrics file. Verify with:
              systemctl status pgbackrest-drill.timer
              journalctl -u pgbackrest-drill.service -n 200
          runbook_url: "https://veza.fr/runbooks/backup-restore-drill-stale"
  # v1.0.9 W3 Day 12: distributed MinIO health. EC:2 tolerates 2-drive
  # loss before data becomes unavailable, so the alert fires the moment
  # one drive is offline, giving us margin to react before the second
  # failure exhausts redundancy.
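  # Quick manual cross-check of per-drive status (assumes an mc alias
  # for the cluster is configured; the alias name "veza" is illustrative):
  #   mc admin info veza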
  - name: veza_minio
    rules:
      - alert: MinIODriveOffline
        # minio_node_drive_online_total counts the drives each node
        # currently sees as online; comparing it against
        # minio_node_drive_total catches any offline drive. The metrics
        # are exposed by every node (set MINIO_PROMETHEUS_AUTH_TYPE=public)
        # so a single missing scrape doesn't trip the alert.
        expr: min(minio_node_drive_online_total) by (server) < min(minio_node_drive_total) by (server)
        for: 2m
        labels:
          severity: warning
          page: "false"
        annotations:
          summary: "MinIO drive offline on {{ $labels.server }}"
          description: |
            One or more drives report offline on {{ $labels.server }}. EC:2
            still serves reads, but a second drive failure would cause a
            data-unavailability event. Investigate within the hour.
              ssh {{ $labels.server }} sudo journalctl -u minio -n 200
          runbook_url: "https://veza.fr/runbooks/minio-drive-offline"

      - alert: MinIONodesUnreachable
        # ≥ 2 nodes down on a 4-node EC:2 cluster = redundancy exhausted.
        # Pages the on-call. (The threshold sits exactly at the 2-drive
        # tolerance: the page lands while we are still one failure away
        # from data unavailability, not after.)
        expr: count(up{job="minio"} == 0) >= 2
        for: 1m
        labels:
          severity: critical
          page: "true"
        annotations:
          summary: "Two or more MinIO nodes unreachable"
          description: |
            EC:2 tolerates 2-drive loss. With 1 drive per node, ≥ 2 nodes
            unreachable means we are at-or-past the redundancy ceiling.
            Any further failure causes data unavailability. Page now.
          runbook_url: "https://veza.fr/runbooks/minio-nodes-unreachable"
  # W5+: Forgejo+Ansible+Incus deploy pipeline. The deploy_app.yml
  # playbook writes a textfile-collector .prom file under
  # /var/lib/node_exporter/textfile_collector/veza_deploy.prom on every
  # deploy attempt. node_exporter scrapes it and exposes the metrics
  # via the standard /metrics endpoint; no Pushgateway needed.
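  # For reference, the file's contents look roughly like this
  # (timestamps illustrative; exact label sets come from deploy_app.yml):
  #   veza_deploy_last_success_timestamp{env="staging"} 1745930000
  #   veza_deploy_last_failure_timestamp{env="staging"} 1745840000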
  - name: veza_deploy
    rules:
      - alert: VezaDeployFailed
        # Fires when last_failure_timestamp is newer than
        # last_success_timestamp. 5m soak so an in-flight deploy
        # (failure is written first; success only lands on the next
        # successful run) doesn't trigger transiently. The `unless`
        # arm covers an env that has never recorded a success: a bare
        # `or vector(0)` inside the aggregation would carry no env
        # label and never match the left-hand side.
        expr: |
          max by (env) (veza_deploy_last_failure_timestamp)
            > max by (env) (veza_deploy_last_success_timestamp)
          or
          max by (env) (veza_deploy_last_failure_timestamp)
            unless max by (env) (veza_deploy_last_success_timestamp)
        for: 5m
        labels:
          severity: critical
          page: "true"
        annotations:
          summary: "Veza deploy to {{ $labels.env }} failed"
          description: |
            The most recent deploy attempt to {{ $labels.env }} failed
            and HAProxy was reverted to the prior color. The failed
            color's containers are kept alive for forensics. Once the
            journalctl output has been read, clean up with:
              gh workflow run cleanup-failed.yml -f env={{ $labels.env }} -f color=<failed_color>
          runbook_url: "https://veza.fr/runbooks/deploy-failed"
      - alert: VezaStaleDeploy
        # Staging cadence is daily-ish; a 7-day silence smells like
        # CI is broken or the team is on holiday with prod still
        # serving an old SHA. Prod is monthly-ish, so 30 days.
        # Two separate alerts because the thresholds differ.
        expr: |
          (time() - max(veza_deploy_last_success_timestamp{env="staging"}) by (env)) > (7 * 86400)
        for: 1h
        labels:
          severity: warning
          page: "false"
        annotations:
          summary: "Staging deploy hasn't succeeded in 7+ days"
          description: |
            Last successful staging deploy was
            {{ $value | humanizeDuration }} ago. Pipeline likely broken
            (Forgejo runner offline? Expired secret?).

      - alert: VezaStaleDeployProd
        expr: |
          (time() - max(veza_deploy_last_success_timestamp{env="prod"}) by (env)) > (30 * 86400)
        for: 1h
        labels:
          severity: warning
          page: "false"
        annotations:
          summary: "Prod deploy hasn't succeeded in 30+ days"
          description: |
            Last successful prod deploy was {{ $value | humanizeDuration }}
            ago. Tag-based release cadence likely stalled.
      - alert: VezaFailedColorAlive
        # The textfile collector also exposes a custom metric,
        # veza_deploy_failed_color_alive{env=...,color=...}, set by
        # a small periodic script that scans `incus list` for live
        # containers in the inactive (failed-deploy) color. (Stub
        # script lives under scripts/observability/scan-failed-colors.sh.)
        # Threshold 24h so the operator has at least a working day
        # to do the post-mortem before the alert fires.
        expr: max(veza_deploy_failed_color_alive) by (env, color) > 0
        for: 24h
        labels:
          severity: warning
          page: "false"
        annotations:
          summary: "Failed deploy color {{ $labels.color }} still alive in {{ $labels.env }}"
          description: |
            A previously-failed deploy color has been kept alive for
            24+ hours. Either complete forensics and run cleanup-failed,
            or the next deploy will recycle it automatically.
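      # For reference, the scanner's output in a lingering-failure state
      # looks roughly like this (label values illustrative):
      #   veza_deploy_failed_color_alive{env="staging",color="green"} 1
      #   veza_deploy_failed_color_alive{env="staging",color="blue"} 0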