feat(monitoring): add Prometheus alerting rules for critical conditions
INF-08: Alert rules for service_down, high_error_rate (>5%), high_latency (P99>2s), and redis_unreachable. Enabled rule_files in prometheus.yml.
This commit is contained in:
parent
3e0e1b5286
commit
6b25ccc9da
2 changed files with 39 additions and 2 deletions
|
|
@ -3,8 +3,7 @@ global:
|
|||
evaluation_interval: 15s
|
||||
|
||||
rule_files:
|
||||
# - "first_rules.yml"
|
||||
# - "second_rules.yml"
|
||||
- "prometheus/alert_rules.yml"
|
||||
|
||||
scrape_configs:
|
||||
- job_name: 'prometheus'
|
||||
|
|
|
|||
38
config/prometheus/alert_rules.yml
Normal file
38
config/prometheus/alert_rules.yml
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
---
# Prometheus alerting rules for critical platform conditions:
# target availability, HTTP 5xx ratio, P99 request latency and Redis
# reachability. Loaded through `rule_files` in prometheus.yml.
groups:
  - name: veza_critical
    rules:
      # Fires for every scrape target whose `up` series has been 0 for 30s.
      - alert: ServiceDown
        expr: up == 0
        for: 30s
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.job }} is down"
          description: "{{ $labels.instance }} has been down for more than 30 seconds."

      # Share of 5xx responses among all requests, per job.
      # Both operands are summed by `job` so their label sets match. Without
      # the aggregation each 5xx series is divided only by itself (binary
      # operators match on identical label sets), giving a constant ratio
      # of 1 that fires on any sustained 5xx traffic.
      - alert: HighErrorRate
        expr: |
          sum by (job) (rate(http_requests_total{status=~"5.."}[5m]))
            /
          sum by (job) (rate(http_requests_total[5m]))
            > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate on {{ $labels.job }}"
          description: "Error rate is above 5% for the last 5 minutes."

      # P99 of the request-duration histogram over a 5m window, threshold 2s.
      # The quantile is computed separately for each label set (every label
      # other than `le`), so the alert can fire once per series combination.
      - alert: HighLatencyP99
        expr: histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High P99 latency on {{ $labels.job }}"
          description: "P99 latency is above 2 seconds for the last 5 minutes."

      # Fires when the `redis_up` series has reported 0 for 30s.
      # NOTE(review): presumably exported by a Redis exporter; if that
      # exporter itself is down the series is absent and only ServiceDown
      # can catch it - confirm the exporter is a scrape target.
      - alert: RedisUnreachable
        expr: redis_up == 0
        for: 30s
        labels:
          severity: critical
        annotations:
          summary: "Redis is unreachable"
          description: "Redis has been unreachable for more than 30 seconds."
|
||||
Loading…
Reference in a new issue