feat(monitoring): add Prometheus alerting rules for critical conditions

INF-08: Alert rules for service_down, high_error_rate (>5%), high_latency (P99>2s), and redis_unreachable. Enabled rule_files in prometheus.yml.
2026-02-22 17:36:07 +01:00 · 2026-02-22 17:36:07 +01:00 · 6b25ccc9da
commit 6b25ccc9da
parent 3e0e1b5286
2 changed files with 39 additions and 2 deletions
--- a/config/prometheus.yml
+++ b/config/prometheus.yml
@ -3,8 +3,7 @@ global:
  evaluation_interval: 15s

 rule_files:
-  # - "first_rules.yml"
-  # - "second_rules.yml"
+  - "prometheus/alert_rules.yml"

 scrape_configs:
  - job_name: 'prometheus'
--- a/config/prometheus/alert_rules.yml
+++ b/config/prometheus/alert_rules.yml
@ -0,0 +1,38 @@
+groups:
+  - name: veza_critical
+    rules:
+      # ServiceDown: critical alert raised when a target's `up` series has
+      # stayed at 0 for 30 seconds. One alert is produced per matching series,
+      # so the job and instance labels are available to the annotations.
+      - alert: ServiceDown
+        expr: 'up == 0'
+        for: 30s
+        labels:
+          severity: critical
+        annotations:
+          summary: 'Service {{ $labels.job }} is down'
+          description: >-
+            {{ $labels.instance }} has been down for more than
+            30 seconds.
+
+      # HighErrorRate: warning raised when, per job, the share of requests
+      # answered with a 5xx status exceeds 5% over a 5-minute rate window and
+      # stays there for 5 minutes.
+      #
+      # Both sides are aggregated with `sum by (job)` before dividing. Without
+      # the aggregation the division matches series one-to-one on identical
+      # label sets (including `status`), so every 5xx series would be divided
+      # by itself, yielding 1 and firing on any 5xx traffic at all.
+      # When a job has no traffic the ratio is NaN, which does not satisfy
+      # `> 0.05`, so idle jobs do not alert.
+      - alert: HighErrorRate
+        expr: |-
+          sum by (job) (rate(http_requests_total{status=~"5.."}[5m]))
+            /
+          sum by (job) (rate(http_requests_total[5m]))
+            > 0.05
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High error rate on {{ $labels.job }}"
+          description: "Error rate is above 5% for the last 5 minutes."
+
+      # HighLatencyP99: warning raised when the 0.99 quantile estimated from
+      # the http_request_duration_seconds histogram buckets (5-minute rate
+      # window) stays above 2 for 5 minutes.
+      # The quantile is evaluated separately for each input label set; no
+      # aggregation across series is applied here.
+      - alert: HighLatencyP99
+        # Folded scalar: the two lines below join with a single space into
+        # one expression.
+        expr: >-
+          histogram_quantile(0.99,
+          rate(http_request_duration_seconds_bucket[5m])) > 2
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: 'High P99 latency on {{ $labels.job }}'
+          description: 'P99 latency is above 2 seconds for the last 5 minutes.'
+
+      # RedisUnreachable: critical alert raised when a `redis_up` series has
+      # reported 0 for 30 seconds.
+      # NOTE(review): the comparison only matches existing series; if
+      # `redis_up` is absent altogether the expression returns nothing and
+      # this rule stays silent - confirm that case is covered elsewhere.
+      - alert: RedisUnreachable
+        expr: 'redis_up == 0'
+        for: 30s
+        labels:
+          severity: critical
+        annotations:
+          summary: 'Redis is unreachable'
+          description: >-
+            Redis has been unreachable for more than 30 seconds.