---
# Prometheus alerting rules covering target availability, HTTP error
# ratio, HTTP request latency, and Redis reachability.
groups:
  - name: veza_critical
    rules:
      # Fires for each series whose `up` value has been 0 for 30 seconds.
      - alert: ServiceDown
        expr: up == 0
        for: 30s
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.job }} is down"
          description: "{{ $labels.instance }} has been down for more than 30 seconds."

      # Fires when, per job, the share of requests with a 5xx status exceeds
      # 5% for 5 minutes.
      # Both sides are aggregated with `sum by (job)` on purpose: dividing the
      # raw vectors would match each 5xx series against itself (the `status`
      # label is part of the match), so the ratio would always be 1.
      # Aggregating by `job` also keeps the label used in the summary below.
      - alert: HighErrorRate
        expr: |-
          sum by (job) (rate(http_requests_total{status=~"5.."}[5m]))
            /
          sum by (job) (rate(http_requests_total[5m]))
            > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate on {{ $labels.job }}"
          description: "Error rate is above 5% for the last 5 minutes."

      # Fires when the estimated 99th-percentile request duration stays above
      # 2 seconds for 5 minutes.
      # NOTE(review): there is no aggregation here, so the quantile is computed
      # separately for every label combination of the histogram (everything
      # except `le`). Confirm that per-series granularity is intended.
      - alert: HighLatencyP99
        expr: histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High P99 latency on {{ $labels.job }}"
          description: "P99 latency is above 2 seconds for the last 5 minutes."

      # Fires when `redis_up` has been 0 for 30 seconds.
      # NOTE(review): presumably exported by a Redis exporter — verify the
      # metric name against the deployed exporter.
      - alert: RedisUnreachable
        expr: redis_up == 0
        for: 30s
        labels:
          severity: critical
        annotations:
          summary: "Redis is unreachable"
          description: "Redis has been unreachable for more than 30 seconds."