# veza/config/prometheus/alert_rules.yml

# Prometheus alerting rules. The `veza_critical` group holds the ServiceDown,
# HighErrorRate, HighLatencyP99 and RedisUnreachable alerts defined below.
groups:
  - name: veza_critical
    rules:
      # ServiceDown: fires once `up == 0` has held for 30s on a series.
      # `job` and `instance` in the annotations come from that series' labels.
      - alert: ServiceDown
        expr: 'up == 0'
        for: 30s
        labels:
          severity: critical
        annotations:
          summary: 'Service {{ $labels.job }} is down'
          description: >-
            {{ $labels.instance }} has been down for more than 30 seconds.

      # HighErrorRate: share of HTTP requests answered with a 5xx status, per
      # job, over a 5-minute rate window; must stay above 5% for 5m to fire.
      #
      # Both sides are summed by `job` before dividing. A bare
      # `rate(x{status=~"5.."}) / rate(x)` matches series one-to-one on their
      # full label set (status included), so each 5xx series is divided by
      # itself and the ratio is 1 whenever any 5xx traffic exists.
      - alert: HighErrorRate
        expr: |
          sum by (job) (rate(http_requests_total{status=~"5.."}[5m]))
            /
          sum by (job) (rate(http_requests_total[5m]))
            > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate on {{ $labels.job }}"
          description: "Error rate is above 5% for the last 5 minutes."

      # HighLatencyP99: 0.99 quantile of http_request_duration_seconds, from
      # 5-minute bucket rates, staying above 2 (seconds, per the metric name
      # and the description text) for 5m.
      # NOTE(review): the bucket rates are not aggregated before
      # histogram_quantile — confirm per-series evaluation is intended rather
      # than `sum by (job, le)`.
      - alert: HighLatencyP99
        expr: >-
          histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High P99 latency on {{ $labels.job }}'
          description: 'P99 latency is above 2 seconds for the last 5 minutes.'

      # RedisUnreachable: fires once `redis_up == 0` has held for 30s.
      # NOTE(review): `redis_up` is presumably exported by a Redis exporter;
      # if that series is absent this expression matches nothing — confirm
      # that case is covered elsewhere.
      - alert: RedisUnreachable
        expr: 'redis_up == 0'
        for: 30s
        labels:
          severity: critical
        annotations:
          summary: 'Redis is unreachable'
          description: >-
            Redis has been unreachable for more than 30 seconds.
# Commit note (2026-02-22 16:36:07 +00:00):
#   feat(monitoring): add Prometheus alerting rules for critical conditions.
#   INF-08: Alert rules for service_down, high_error_rate (>5%),
#   high_latency (P99>2s), and redis_unreachable.
#   Enabled rule_files in prometheus.yml.