---
# v0.952: Prometheus alert rules for Veza v1
# API down, error >5%, P99>1s, Redis, PG>80%, disk>90%
#
# Layout: one rule group (veza_v1_critical) holding six alerts. Every alert
# carries a `severity` label (critical or warning) and `summary` /
# `description` annotations.
#
# NOTE(review): APIDown selects the backend with a regex job matcher
# (job=~"veza-backend.*") while HighErrorRate, HighLatencyP99 and
# PostgresPoolHigh use the exact matcher job="veza-backend". Confirm which
# job names actually exist so that all four rules cover the same targets.
groups:
  - name: veza_v1_critical
    rules:
      # Fires when `up` is 0 for any target whose job label matches
      # veza-backend.*, sustained for 30 seconds.
      - alert: APIDown
        expr: up{job=~"veza-backend.*"} == 0
        for: 30s
        labels:
          severity: critical
        annotations:
          summary: "Veza API {{ $labels.instance }} is down"
          description: "API has been unreachable for more than 30 seconds."

      # Ratio of the 5xx request rate to the total request rate, both taken
      # over a 5m window and summed across all series of the job. Fires when
      # the ratio stays above 0.05 (5%) for 5 minutes.
      - alert: HighErrorRate
        expr: |
          (
            sum(rate(veza_gin_http_requests_total{job="veza-backend",status=~"5.."}[5m]))
            /
            sum(rate(veza_gin_http_requests_total{job="veza-backend"}[5m]))
          ) > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate on Veza API"
          description: "5xx error rate is above 5% for the last 5 minutes."

      # 0.99 quantile of request duration, computed from the histogram
      # buckets' 5m rates aggregated by `le` only (so it is one value for the
      # whole job, not per instance). Fires when it stays above 1 for 5
      # minutes; the metric name indicates the unit is seconds.
      - alert: HighLatencyP99
        expr: |
          histogram_quantile(0.99,
            sum(rate(veza_gin_http_request_duration_seconds_bucket{job="veza-backend"}[5m])) by (le)
          ) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High P99 latency on Veza API"
          description: "P99 latency is above 1 second for the last 5 minutes."

      # Two signals joined with `or`: the `redis_up` metric being 0, or `up`
      # being 0 for a job matching redis.*. Either one sustained for 30
      # seconds fires the alert.
      - alert: RedisDown
        expr: redis_up == 0 or up{job=~"redis.*"} == 0
        for: 30s
        labels:
          severity: critical
        annotations:
          summary: "Redis is unreachable"
          description: "Redis has been unreachable for more than 30 seconds."

      # In-use connections divided by the configured maximum of open
      # connections. Fires when the ratio stays above 0.8 (80%) for 5 minutes.
      - alert: PostgresPoolHigh
        expr: |
          (
            veza_db_pool_in_use{job="veza-backend"}
            /
            veza_db_pool_max_open_connections{job="veza-backend"}
          ) > 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PostgreSQL pool usage above 80%"
          description: "DB connection pool usage exceeds 80%."

      # Available bytes divided by total bytes for the filesystem mounted at
      # "/". Fires when the free fraction stays below 0.1 (10%) for 5 minutes.
      - alert: DiskSpaceLow
        expr: |
          (
            node_filesystem_avail_bytes{mountpoint="/"}
            /
            node_filesystem_size_bytes{mountpoint="/"}
          ) < 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Disk space below 10%"
          description: "Free disk space is less than 10% on root filesystem."