# INF-08: Alert rules for service_down, high_error_rate (> 5%), high_latency (P99 > 2s), and redis_unreachable.
# This file is enabled through the rule_files entry in prometheus.yml.
# Viewer metadata at capture time: 38 lines, 1.2 KiB, YAML.
# Prometheus alerting rules for the veza_critical group.
# Each rule must hold continuously for its `for:` duration before it fires.
groups:
  - name: veza_critical
    rules:
      # Fires when a scrape target's `up` series has been 0 for 30 seconds.
      - alert: ServiceDown
        expr: up == 0
        for: 30s
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.job }} is down"
          description: "{{ $labels.instance }} has been down for more than 30 seconds."

      # Fires when more than 5% of a job's requests return a 5xx status.
      # Both sides are aggregated by `job` before dividing: without the
      # aggregation the division matches series on their full label set
      # (including `status`), so every 5xx series is divided by itself and
      # the ratio is always 1 instead of the real error fraction.
      - alert: HighErrorRate
        expr: |
          sum by (job) (rate(http_requests_total{status=~"5.."}[5m]))
            /
          sum by (job) (rate(http_requests_total[5m]))
            > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate on {{ $labels.job }}"
          description: "Error rate is above 5% for the last 5 minutes."

      # Fires when the 99th-percentile request duration exceeds 2 seconds.
      # The quantile is computed per label set (everything except `le`), so
      # one alert is raised per distinct series group.
      # NOTE(review): if a per-job P99 is wanted, wrap the rate in
      # `sum by (job, le) (...)` - confirm the intended granularity.
      - alert: HighLatencyP99
        expr: histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High P99 latency on {{ $labels.job }}"
          description: "P99 latency is above 2 seconds for the last 5 minutes."

      # Fires when `redis_up` has been 0 for 30 seconds.
      # NOTE(review): presumably `redis_up` comes from a Redis exporter; if
      # that exporter itself is not scraped the series is absent and this
      # rule cannot fire - verify that ServiceDown covers that case.
      - alert: RedisUnreachable
        expr: redis_up == 0
        for: 30s
        labels:
          severity: critical
        annotations:
          summary: "Redis is unreachable"
          description: "Redis has been unreachable for more than 30 seconds."