diff --git a/config/prometheus.yml b/config/prometheus.yml
index 9ddcf5453..db94ffa41 100644
--- a/config/prometheus.yml
+++ b/config/prometheus.yml
@@ -3,8 +3,7 @@ global:
   evaluation_interval: 15s
 
 rule_files:
-  # - "first_rules.yml"
-  # - "second_rules.yml"
+  - "prometheus/alert_rules.yml"
 
 scrape_configs:
   - job_name: 'prometheus'
diff --git a/config/prometheus/alert_rules.yml b/config/prometheus/alert_rules.yml
new file mode 100644
index 000000000..ea9479534
--- /dev/null
+++ b/config/prometheus/alert_rules.yml
@@ -0,0 +1,49 @@
+# Alerting rules referenced from the rule_files list in config/prometheus.yml.
+groups:
+  - name: veza_critical
+    rules:
+      # Fires when a series of the `up` metric has been 0 for 30 seconds.
+      - alert: ServiceDown
+        expr: up == 0
+        for: 30s
+        labels:
+          severity: critical
+        annotations:
+          summary: "Service {{ $labels.job }} is down"
+          description: "{{ $labels.instance }} has been down for more than 30 seconds."
+
+      # Ratio of the 5xx request rate to the total request rate, per job.
+      # Both sides are summed by job so their label sets match; without the
+      # aggregation each 5xx series is only divided by itself (ratio of 1).
+      - alert: HighErrorRate
+        expr: >-
+          sum by (job) (rate(http_requests_total{status=~"5.."}[5m]))
+          /
+          sum by (job) (rate(http_requests_total[5m]))
+          > 0.05
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High error rate on {{ $labels.job }}"
+          description: "Error rate is above 5% for the last 5 minutes."
+
+      # 0.99 quantile of request duration computed from the histogram buckets.
+      - alert: HighLatencyP99
+        expr: histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) > 2
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High P99 latency on {{ $labels.job }}"
+          description: "P99 latency is above 2 seconds for the last 5 minutes."
+
+      # Fires when redis_up has been 0 for 30 seconds.
+      # NOTE(review): presumably exported by a Redis exporter job - confirm.
+      - alert: RedisUnreachable
+        expr: redis_up == 0
+        for: 30s
+        labels:
+          severity: critical
+        annotations:
+          summary: "Redis is unreachable"
+          description: "Redis has been unreachable for more than 30 seconds."