veza/config/prometheus/alert_rules.yml

groups:
  - name: veza_critical
    rules:
      - alert: ServiceDown
        expr: up == 0
        for: 30s
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.job }} is down"
          description: "{{ $labels.instance }} has been down for more than 30 seconds."

      - alert: HighErrorRate
        expr: rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m]) > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate on {{ $labels.job }}"
          description: "Error rate is above 5% for the last 5 minutes."

      - alert: HighLatencyP99
        expr: histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High P99 latency on {{ $labels.job }}"
          description: "P99 latency is above 2 seconds for the last 5 minutes."

      - alert: RedisUnreachable
        expr: redis_up == 0
        for: 30s
        labels:
          severity: critical
        annotations:
          summary: "Redis is unreachable"
          description: "Redis has been unreachable for more than 30 seconds."

  # v1.0.9 Day 8: backup integrity. The dr-drill.sh script writes
  # textfile-collector metrics on every run. Two failure modes are
  # caught:
  #   1. last drill reported a failure (success=0)
  #   2. drill hasn't run in 8+ days (timer broke, runner offline,
  #      script crashed before write_metric)
  # Both are pages because a backup we haven't proved restorable is
  # dette technique waiting for a disaster to bite — finding out at
  # restore-time is too late.
  - name: veza_backup
    rules:
      - alert: BackupRestoreDrillFailed
        expr: veza_backup_drill_last_success == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "pgBackRest dr-drill last run failed (stanza={{ $labels.stanza }})"
          description: |
            The most recent dr-drill.sh execution reported failure
            (reason={{ $labels.reason }}). Backups exist but a
            restore from them did NOT round-trip the smoke query.
            Investigate via: journalctl -u pgbackrest-drill.service -n 200
            and consider running the drill manually with --keep to
            inspect the restored container before teardown.
          runbook_url: "https://veza.fr/runbooks/backup-restore-drill-failed"

      - alert: BackupRestoreDrillStale
        expr: time() - veza_backup_drill_last_run_timestamp_seconds > 691200  # 8 days
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "pgBackRest dr-drill hasn't run in 8+ days"
          description: |
            The dr-drill timer fires weekly (Sun 04:00 UTC). A run
            older than 8 days means the timer is broken, the runner
            is offline, or the script crashed before writing its
            metrics file. Verify with:
              systemctl status pgbackrest-drill.timer
              journalctl -u pgbackrest-drill.service -n 200
          runbook_url: "https://veza.fr/runbooks/backup-restore-drill-stale"