veza/veza-backend-api/ops/prometheus/alerts.yml
2025-12-16 11:23:49 -05:00

152 lines
5.1 KiB
YAML

groups:
# Critical backend conditions: circuit breaker state and DB connection pool.
- name: veza_backend_critical
  interval: 30s
  rules:
  # Circuit breaker stuck open.
  # Fires once the state gauge has equalled 2 for 5 minutes; the annotations
  # below describe that value as the OPEN state.
  - alert: VezaCircuitBreakerOpen
    expr: veza_circuit_breaker_state == 2
    for: 5m
    labels:
      severity: critical
      component: circuit_breaker
    annotations:
      summary: "Circuit breaker ouvert depuis plus de 5 minutes"
      # Annotation templates only receive $labels, $value, $externalLabels and
      # $externalURL. There is no $for variable (referencing one is a template
      # error), so the hold duration is written literally here; keep it in
      # sync with the `for` field of this rule.
      description: >-
        Circuit breaker '{{ $labels.circuit_breaker_name }}' est en état OPEN
        depuis plus de 5 minutes. Les requêtes vers ce service sont rejetées.
      runbook: "docs/runbooks/circuit_breaker_open.md"
  # DB pool above 80% of capacity.
  # MaxOpenConns is configured to 25 in internal/config/config.go; the
  # divisor below must be updated if that setting changes.
  # The expression yields the usage ratio (open / 25), so $value is a
  # fraction suitable for humanizePercentage, not a connection count.
  - alert: VezaDBPoolHighUsage
    expr: veza_db_pool_open_connections / 25 > 0.8
    for: 5m
    labels:
      severity: warning
      component: database
    annotations:
      summary: "DB pool utilisation > 80%"
      description: "Pool de connexions DB utilise {{ $value | humanizePercentage }} de sa capacité (MaxOpenConns = 25)."
      runbook: "docs/runbooks/db_down.md"
  # DB pool exhausted: the per-second rate of the wait counter, taken over
  # 5 minutes, stays above 0.1 for 2 minutes.
  - alert: VezaDBPoolExhausted
    expr: |
      rate(veza_db_pool_wait_count_total[5m]) > 0.1
    for: 2m
    labels:
      severity: critical
      component: database
    annotations:
      summary: "DB pool épuisé - connexions en attente"
      description: "Le pool DB est saturé. Taux d'attente: {{ $value | humanize }} requêtes/seconde."
      runbook: "docs/runbooks/db_down.md"
# HTTP error-rate alerts derived from the veza_gin_http_requests_total counter.
- name: veza_backend_errors
  interval: 30s
  rules:
  # High 5xx ratio: share of requests whose status matches 5.., computed from
  # 5-minute rates, above 5% for 5 minutes.
  - alert: VezaHigh5xxRate
    expr: |
      (
        sum(rate(veza_gin_http_requests_total{status=~"5.."}[5m]))
        /
        sum(rate(veza_gin_http_requests_total[5m]))
      ) > 0.05
    for: 5m
    labels:
      severity: warning
      component: api
    annotations:
      summary: "Taux erreurs 5xx > 5%"
      description: "{{ $value | humanizePercentage }} des requêtes retournent 5xx sur les 5 dernières minutes."
      runbook: "docs/runbooks/high_error_rate.md"
  # High absolute 5xx volume: more than 10 such responses per second
  # (5-minute rate, summed over all series) for 2 minutes.
  - alert: VezaHigh5xxAbsolute
    expr: |
      sum(rate(veza_gin_http_requests_total{status=~"5.."}[5m])) > 10
    for: 2m
    labels:
      severity: critical
      component: api
    annotations:
      summary: "Plus de 10 erreurs 5xx/seconde"
      description: "{{ $value | humanize }} erreurs 5xx/seconde détectées."
      runbook: "docs/runbooks/high_error_rate.md"
# Latency alerts computed from the request-duration histogram buckets.
- name: veza_backend_latency
  interval: 30s
  rules:
  # High P95 latency on critical endpoints.
  # First branch: P95 over all paths matching
  # /api/v1/(tracks|auth/login|upload).*; second branch: P95 over
  # /api/v1/tracks.* alone. The alert fires when either stays above 1s
  # for 5 minutes.
  - alert: VezaHighLatencyCriticalEndpoints
    expr: |
      (
        histogram_quantile(0.95,
          sum(rate(veza_gin_http_request_duration_seconds_bucket{
            path=~"/api/v1/(tracks|auth/login|upload).*"
          }[5m])) by (le)
        ) > 1.0
      ) OR (
        histogram_quantile(0.95,
          sum(rate(veza_gin_http_request_duration_seconds_bucket{
            path=~"/api/v1/tracks.*"
          }[5m])) by (le)
        ) > 1.0
      )
    for: 5m
    labels:
      severity: warning
      component: api
    annotations:
      summary: "Latence P95 > 1s sur endpoints critiques"
      description: "Latence P95: {{ $value }}s sur endpoints critiques."
      runbook: "docs/runbooks/high_latency.md"
  # Very high P99 latency: P99 across every path above 5s for 3 minutes.
  - alert: VezaVeryHighLatency
    expr: |
      histogram_quantile(0.99,
        sum(rate(veza_gin_http_request_duration_seconds_bucket[5m])) by (le)
      ) > 5.0
    for: 3m
    labels:
      severity: warning
      component: api
    annotations:
      summary: "Latence P99 > 5s"
      description: "Latence P99: {{ $value }}s (très élevée)."
      runbook: "docs/runbooks/high_latency.md"
# Availability and health-status alerts for the backend service.
- name: veza_backend_health
  interval: 30s
  rules:
  # Backend target down: the `up` series for job veza-backend-api has been 0
  # for 1 minute.
  - alert: VezaReadinessFailed
    expr: |
      up{job="veza-backend-api"} == 0
    for: 1m
    labels:
      severity: critical
      component: health
    annotations:
      summary: "Service veza-backend-api down"
      description: "Le service ne répond plus (readiness check failed)."
      runbook: "docs/runbooks/service_down.md"
  # Degraded health: veza_health_check_status has been below 1 for
  # 10 minutes. The description attributes this to optional dependencies
  # (Redis/RabbitMQ) being down.
  - alert: VezaHealthDegraded
    expr: |
      veza_health_check_status < 1
    for: 10m
    labels:
      severity: warning
      component: health
    annotations:
      summary: "Service en mode dégradé"
      description: "Un ou plusieurs services optionnels sont down (Redis/RabbitMQ)."
      runbook: "docs/runbooks/service_degraded.md"