# veza/config/prometheus/alert_rules.yml

groups:
  - name: veza_critical
    rules:
      # Critical alert for any scrape target whose `up` series has read 0
      # for 30 seconds. The annotations pull the job and instance labels
      # from the firing series.
      - alert: ServiceDown
        expr: 'up == 0'
        for: 30s
        labels:
          severity: critical
        annotations:
          summary: 'Service {{ $labels.job }} is down'
          description: '{{ $labels.instance }} has been down for more than 30 seconds.'

      # Warns when more than 5% of a job's HTTP requests returned a 5xx
      # status over the trailing 5 minutes, sustained for 5 minutes.
      #
      # Both sides are summed by job before dividing. Without the
      # aggregation the division matches series one-to-one on identical
      # label sets (the `status` label included), so each 5xx series is
      # divided by itself, the ratio is always 1, and the alert fires on
      # any sustained 5xx traffic regardless of the real error rate.
      # Grouping by job keeps the label used in the summary annotation.
      - alert: HighErrorRate
        expr: |
          sum by (job) (rate(http_requests_total{status=~"5.."}[5m]))
            /
          sum by (job) (rate(http_requests_total[5m]))
            > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate on {{ $labels.job }}"
          description: "Error rate is above 5% for the last 5 minutes."

      # Warns when the 99th-percentile request duration, estimated from the
      # http_request_duration_seconds histogram buckets over a 5-minute
      # rate window, stays above 2 seconds for 5 minutes.
      # NOTE(review): the rate() is not aggregated, so the quantile is
      # computed separately for every label combination of the bucket
      # series — confirm that per-series alerting is the intent.
      - alert: HighLatencyP99
        expr: 'histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) > 2'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High P99 latency on {{ $labels.job }}'
          description: 'P99 latency is above 2 seconds for the last 5 minutes.'

      # Critical alert once the redis_up series has read 0 for 30 seconds.
      # NOTE(review): presumably redis_up comes from a Redis exporter; if
      # the exporter itself is down the series is absent and only
      # ServiceDown can catch it — confirm against the scrape config.
      - alert: RedisUnreachable
        expr: 'redis_up == 0'
        for: 30s
        labels:
          severity: critical
        annotations:
          summary: 'Redis is unreachable'
          description: 'Redis has been unreachable for more than 30 seconds.'

  # v1.0.9 Day 8: backup integrity. The dr-drill.sh script writes
  # textfile-collector metrics on every run. Two failure modes are
  # caught:
  #   1. last drill reported a failure (success=0)
  #   2. drill hasn't run in 8+ days (timer broke, runner offline,
  #      script crashed before write_metric)
  # Both are pages because a backup we haven't proved restorable is
  # technical debt waiting for a disaster to bite — finding out at
  # restore-time is too late.
  - name: veza_backup
    rules:
      # Critical alert when veza_backup_drill_last_success has read 0 for
      # 5 minutes, i.e. the most recent drill run recorded a failure.
      # NOTE(review): the annotations assume the series carries `stanza`
      # and `reason` labels — confirm against what dr-drill.sh writes.
      - alert: BackupRestoreDrillFailed
        expr: 'veza_backup_drill_last_success == 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'pgBackRest dr-drill last run failed (stanza={{ $labels.stanza }})'
          description: |
            The most recent dr-drill.sh execution reported failure
            (reason={{ $labels.reason }}). Backups exist but a
            restore from them did NOT round-trip the smoke query.
            Investigate via: journalctl -u pgbackrest-drill.service -n 200
            and consider running the drill manually with --keep to
            inspect the restored container before teardown.
          runbook_url: 'https://veza.fr/runbooks/backup-restore-drill-failed'

      # Warns when the last recorded drill run is more than 691200 seconds
      # (8 days) old, sustained for 1 hour.
      # NOTE(review): the group comment says both backup alerts page, yet
      # this one is severity: warning — confirm the Alertmanager routing.
      # NOTE(review): if the timestamp series is absent altogether (metrics
      # file never written or not scraped) this expression returns nothing
      # and cannot fire — confirm whether an absent() guard is wanted.
      - alert: BackupRestoreDrillStale
        expr: 'time() - veza_backup_drill_last_run_timestamp_seconds > 691200'
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "pgBackRest dr-drill hasn't run in 8+ days"
          description: |
            The dr-drill timer fires weekly (Sun 04:00 UTC). A run
            older than 8 days means the timer is broken, the runner
            is offline, or the script crashed before writing its
            metrics file. Verify with:
              systemctl status pgbackrest-drill.timer
              journalctl -u pgbackrest-drill.service -n 200
          runbook_url: 'https://veza.fr/runbooks/backup-restore-drill-stale'

  # v1.0.9 W3 Day 12: distributed MinIO health. EC:2 tolerates 2-drive
  # loss before data becomes unavailable, so the alert fires the moment
  # one drive is offline — gives us margin to react before the second
  # failure exhausts redundancy.
  - name: veza_minio
    rules:
      - alert: MinIODriveOffline
        # Per `server` label, compares the reported number of online drives
        # (minio_node_drive_online_total) with the reported drive count
        # (minio_node_drive_total) and fires when fewer drives are online
        # than exist. The 2-minute `for` window is what keeps a momentary
        # dip from tripping the alert.
        # Scraping assumes the MinIO metrics endpoint is reachable without
        # auth (MINIO_PROMETHEUS_AUTH_TYPE=public) — TODO confirm against
        # the scrape config.
        expr: 'min(minio_node_drive_online_total) by (server) < min(minio_node_drive_total) by (server)'
        for: 2m
        labels:
          severity: warning
          page: 'false'
        annotations:
          summary: 'MinIO drive offline on {{ $labels.server }}'
          description: |
            One or more drives report offline on {{ $labels.server }}. EC:2
            still serves reads, but a second drive failure would cause a
            data-unavailability event. Investigate within the hour.
              ssh {{ $labels.server }} sudo journalctl -u minio -n 200
          runbook_url: 'https://veza.fr/runbooks/minio-drive-offline'

      - alert: MinIONodesUnreachable
        # Counts scrape targets with job="minio" whose `up` is 0 and pages
        # the on-call once two or more have been down for 1 minute.
        # On the 4-node EC:2 cluster this threshold sits AT the 2-drive
        # tolerance, not below it: redundancy is already used up when this
        # fires. The early signal that still leaves headroom is the
        # single-drive MinIODriveOffline warning.
        expr: 'count(up{job="minio"} == 0) >= 2'
        for: 1m
        labels:
          severity: critical
          page: 'true'
        annotations:
          summary: 'Two or more MinIO nodes unreachable'
          description: |
            EC:2 tolerates 2-drive loss. With 1 drive per node, ≥ 2 nodes
            unreachable means we are at-or-past the redundancy ceiling.
            Any further failure causes data unavailability. Page now.
          runbook_url: 'https://veza.fr/runbooks/minio-nodes-unreachable'