---
# v0.952: Prometheus alert rules for Veza v1
# Covers: API down, 5xx error rate > 5%, P99 latency > 1s, Redis down,
# PostgreSQL pool usage > 80%, root disk free space < 10% (usage > 90%).

groups:
  # NOTE(review): the group is named "critical" but also carries
  # warning-severity alerts; the name is kept unchanged because other
  # configuration may reference it.
  - name: veza_v1_critical
    rules:
      # Fires when any scrape target whose job name starts with
      # "veza-backend" reports up == 0 for 30 seconds.
      - alert: APIDown
        expr: up{job=~"veza-backend.*"} == 0
        for: 30s
        labels:
          severity: critical
        annotations:
          summary: "Veza API {{ $labels.instance }} is down"
          description: "API has been unreachable for more than 30 seconds."

      # Ratio of 5xx responses to all responses over a 5-minute rate window,
      # summed across all series of the veza-backend job. Must stay above 5%
      # for 5 minutes before firing.
      - alert: HighErrorRate
        expr: |
          (
            sum(rate(veza_gin_http_requests_total{job="veza-backend",status=~"5.."}[5m]))
            / sum(rate(veza_gin_http_requests_total{job="veza-backend"}[5m]))
          ) > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate on Veza API"
          description: "5xx error rate is above 5% for the last 5 minutes."

      # 99th-percentile request duration computed from the histogram buckets,
      # aggregated by `le` only (i.e. one quantile for the whole job).
      - alert: HighLatencyP99
        expr: |
          histogram_quantile(0.99,
            sum(rate(veza_gin_http_request_duration_seconds_bucket{job="veza-backend"}[5m])) by (le)
          ) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High P99 latency on Veza API"
          description: "P99 latency is above 1 second for the last 5 minutes."

      # Fires on either signal: the redis_up metric reading 0, or a scrape
      # target whose job name starts with "redis" reporting up == 0.
      - alert: RedisDown
        expr: redis_up == 0 or up{job=~"redis.*"} == 0
        for: 30s
        labels:
          severity: critical
        annotations:
          summary: "Redis is unreachable"
          description: "Redis has been unreachable for more than 30 seconds."

      # In-use connections divided by the configured maximum, per series of
      # the veza-backend job; fires above 80% sustained for 5 minutes.
      - alert: PostgresPoolHigh
        expr: |
          (veza_db_pool_in_use{job="veza-backend"} / veza_db_pool_max_open_connections{job="veza-backend"}) > 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PostgreSQL pool usage above 80%"
          description: "DB connection pool usage exceeds 80%."

      # Available-to-total byte ratio for the "/" mountpoint; fires when less
      # than 10% remains for 5 minutes.
      - alert: DiskSpaceLow
        expr: |
          (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Disk space below 10%"
          description: "Free disk space is less than 10% on root filesystem."