# Source path: veza/config/prometheus/alert_rules_v1.yml
# (Listed in the repository viewer as: 68 lines, 2.2 KiB, YAML.)

# v0.952: Prometheus alert rules for Veza v1.
# Covers: API down, 5xx error rate > 5%, P99 latency > 1s, Redis down,
# PostgreSQL connection pool usage > 80%, root disk usage > 90%.
groups:
  # Single rule group holding every Veza v1 alert. Despite the "_critical"
  # suffix in its name, it carries both `critical` and `warning` severities.
  - name: veza_v1_critical
    rules:
      # Fires when a scrape target whose job label starts with "veza-backend"
      # has reported up == 0 for 30 seconds.
      # NOTE(review): this returns nothing when no `up` series exists at all
      # (e.g. the target was removed from service discovery) -- consider a
      # companion absent() rule if that case must page as well.
      - alert: APIDown
        expr: up{job=~"veza-backend.*"} == 0
        for: 30s
        labels:
          severity: critical
        annotations:
          summary: "Veza API {{ $labels.instance }} is down"
          description: "API has been unreachable for more than 30 seconds."

      # Share of requests answered with a 5xx status, computed from 5-minute
      # rates summed over every series of job "veza-backend".
      # With zero traffic the ratio is NaN, which the `> 0.05` filter drops,
      # so the alert stays silent instead of firing on an idle service.
      - alert: HighErrorRate
        expr: |
          (
            sum(rate(veza_gin_http_requests_total{job="veza-backend",status=~"5.."}[5m]))
            / sum(rate(veza_gin_http_requests_total{job="veza-backend"}[5m]))
          ) > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate on Veza API"
          description: "5xx error rate is above 5% for the last 5 minutes."

      # 99th-percentile request duration estimated from the histogram buckets,
      # aggregated by `le` only (i.e. one value across all instances/routes).
      - alert: HighLatencyP99
        expr: |
          histogram_quantile(
            0.99,
            sum by (le) (rate(veza_gin_http_request_duration_seconds_bucket{job="veza-backend"}[5m]))
          ) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High P99 latency on Veza API"
          description: "P99 latency is above 1 second for the last 5 minutes."

      # Fires on either signal: the `redis_up` metric reporting 0, or a scrape
      # target whose job label starts with "redis" being down. Parentheses are
      # for readability only; comparisons already bind tighter than `or`.
      - alert: RedisDown
        expr: (redis_up == 0) or (up{job=~"redis.*"} == 0)
        for: 30s
        labels:
          severity: critical
        annotations:
          summary: "Redis is unreachable"
          description: "Redis has been unreachable for more than 30 seconds."

      # In-use connections as a fraction of the configured pool maximum.
      # The `> 0` filter on the denominator drops series whose maximum is 0;
      # otherwise in_use / 0 evaluates to +Inf and the alert fires forever.
      # NOTE(review): presumably the gauge mirrors Go's
      # sql.DBStats.MaxOpenConnections, where 0 means "unlimited" -- confirm
      # against the exporter code.
      - alert: PostgresPoolHigh
        expr: |
          (
            veza_db_pool_in_use{job="veza-backend"}
            / (veza_db_pool_max_open_connections{job="veza-backend"} > 0)
          ) > 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PostgreSQL pool usage above 80%"
          description: "DB connection pool usage exceeds 80%."

      # Ratio of available bytes to total bytes on the filesystem mounted at
      # "/" has stayed below 10% for 5 minutes.
      - alert: DiskSpaceLow
        expr: |
          (
            node_filesystem_avail_bytes{mountpoint="/"}
            / node_filesystem_size_bytes{mountpoint="/"}
          ) < 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Disk space below 10%"
          description: "Free disk space is less than 10% on root filesystem."