feat(monitoring): add Prometheus alerting rules for critical conditions

INF-08: Alert rules for service_down, high_error_rate (>5%),
high_latency (P99>2s), and redis_unreachable. Enabled rule_files
in prometheus.yml.
This commit is contained in:
senke 2026-02-22 17:36:07 +01:00
parent 3e0e1b5286
commit 6b25ccc9da
2 changed files with 39 additions and 2 deletions

View file

@ -3,8 +3,7 @@ global:
evaluation_interval: 15s
# Rule files loaded by Prometheus; alerting rules are enabled through the
# uncommented entry below.
# NOTE(review): relative paths are presumably resolved against the directory
# that holds this config file — confirm prometheus/alert_rules.yml exists there.
rule_files:
# - "first_rules.yml"
# - "second_rules.yml"
- "prometheus/alert_rules.yml"
scrape_configs:
- job_name: 'prometheus'

View file

@ -0,0 +1,38 @@
# Alerting rules for critical conditions (INF-08): service down, high HTTP
# error rate, high P99 latency, and Redis unreachable.
groups:
  - name: veza_critical
    rules:
      # Fires for every scrape target whose `up` series has been 0 for 30s.
      - alert: ServiceDown
        expr: up == 0
        for: 30s
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.job }} is down"
          description: "{{ $labels.instance }} has been down for more than 30 seconds."

      # Fraction of 5xx responses per job over a 5m window.
      # Both sides are aggregated by `job` so their label sets match: dividing
      # the raw 5xx series by the unfiltered series would pair each 5xx series
      # only with itself (ratio always 1), firing on any 5xx traffic at all.
      - alert: HighErrorRate
        expr: >-
          sum by (job) (rate(http_requests_total{status=~"5.."}[5m]))
          /
          sum by (job) (rate(http_requests_total[5m]))
          > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate on {{ $labels.job }}"
          description: "Error rate is above 5% for the last 5 minutes."

      # P99 request duration from the histogram buckets, threshold 2 seconds.
      # No aggregation is applied, so the quantile is evaluated separately for
      # each label combination of the bucket series.
      - alert: HighLatencyP99
        expr: histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High P99 latency on {{ $labels.job }}"
          description: "P99 latency is above 2 seconds for the last 5 minutes."

      # Fires when the `redis_up` metric has reported 0 for 30s.
      # NOTE(review): if `redis_up` is absent (e.g. its exporter is not
      # scraped) this expression returns nothing and the alert stays silent.
      - alert: RedisUnreachable
        expr: redis_up == 0
        for: 30s
        labels:
          severity: critical
        annotations:
          summary: "Redis is unreachable"
          description: "Redis has been unreachable for more than 30 seconds."