# veza/config/prometheus/alert_rules_v1.yml

# v0.952: Prometheus alert rules for Veza v1
# Covers: API down, 5xx error rate > 5%, P99 latency > 1s, Redis down,
# PostgreSQL connection pool usage > 80%, root disk usage > 90% (free < 10%)

# Top-level list of Prometheus rule groups. This file defines a single group
# containing the Veza v1 alerts (API availability, 5xx error ratio, P99
# latency, Redis availability, DB pool saturation, root filesystem space).
groups:
  # NOTE(review): the group is named "critical", but some rules in it are
  # labelled severity: warning — confirm whether the name should be broader.
  - name: veza_v1_critical
    rules:
      # Fires when a scrape target whose job name starts with "veza-backend"
      # has reported up == 0 continuously for 30 seconds. One alert is raised
      # per failing target; the summary names it via the instance label.
      - alert: APIDown
        expr: 'up{job=~"veza-backend.*"} == 0'
        for: 30s
        annotations:
          summary: 'Veza API {{ $labels.instance }} is down'
          description: 'API has been unreachable for more than 30 seconds.'
        labels:
          severity: critical

      # Ratio of 5xx responses to all responses for job "veza-backend", both
      # taken as 5-minute rates and summed across every series of the job.
      # The alert fires once that ratio has stayed above 0.05 for 5 minutes.
      # NOTE(review): this matcher is the exact job name "veza-backend", while
      # APIDown matches the regex "veza-backend.*" — confirm which is intended.
      - alert: HighErrorRate
        expr: |
          (
            sum(rate(veza_gin_http_requests_total{job="veza-backend",status=~"5.."}[5m]))
            / sum(rate(veza_gin_http_requests_total{job="veza-backend"}[5m]))
          ) > 0.05
        for: 5m
        annotations:
          summary: 'High error rate on Veza API'
          description: '5xx error rate is above 5% for the last 5 minutes.'
        labels:
          severity: warning

      # 99th-percentile request duration for job "veza-backend", estimated with
      # histogram_quantile over 5-minute bucket rates aggregated by the "le"
      # label only (so all other labels are merged into a single series).
      # The alert fires once the estimate has stayed above 1 for 5 minutes.
      - alert: HighLatencyP99
        expr: |
          histogram_quantile(0.99,
            sum(rate(veza_gin_http_request_duration_seconds_bucket{job="veza-backend"}[5m])) by (le)
          ) > 1
        for: 5m
        annotations:
          summary: 'High P99 latency on Veza API'
          description: 'P99 latency is above 1 second for the last 5 minutes.'
        labels:
          severity: warning

      # Fires when redis_up is 0, or when a scrape target whose job name starts
      # with "redis" is itself down, continuously for 30 seconds.
      # PromQL binds == tighter than `or`, so the expression evaluates as
      # (redis_up == 0) or (up{job=~"redis.*"} == 0).
      # The summary names the affected target so multiple Redis instances can
      # be told apart, matching the APIDown rule.
      # NOTE(review): if neither series exists at all, nothing fires — confirm
      # whether an absent() guard is wanted.
      - alert: RedisDown
        expr: redis_up == 0 or up{job=~"redis.*"} == 0
        for: 30s
        labels:
          severity: critical
        annotations:
          summary: "Redis {{ $labels.instance }} is unreachable"
          description: "Redis has been unreachable for more than 30 seconds."

      # Share of the DB connection pool currently in use, per backend target.
      # Fires once in_use / max_open_connections has stayed above 0.8 for 5m.
      # The denominator is filtered with "> 0" so that a pool reporting a max
      # of 0 is skipped instead of dividing by zero: x / 0 evaluates to +Inf in
      # PromQL, which would satisfy "> 0.8" and raise a false alert.
      # NOTE(review): presumably 0 means "unlimited", as in Go's
      # sql.DBStats.MaxOpenConnections — confirm against the exporter.
      - alert: PostgresPoolHigh
        expr: |
          (
            veza_db_pool_in_use{job="veza-backend"}
            / (veza_db_pool_max_open_connections{job="veza-backend"} > 0)
          ) > 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PostgreSQL pool usage above 80%"
          description: "DB connection pool usage exceeds 80% on {{ $labels.instance }}."

      # Fraction of the root filesystem (mountpoint "/") still available.
      # Fires once available / size has stayed below 0.1 for 5 minutes, i.e.
      # the disk is more than 90% used.
      # The summary names the affected host so the alert is actionable when
      # more than one node is scraped.
      - alert: DiskSpaceLow
        expr: |
          (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Disk space below 10% on {{ $labels.instance }}"
          description: "Free disk space is less than 10% on root filesystem."