---
# Prometheus alerting rules for the Veza backend API.
#
# Every group is evaluated every 30s. Each alert carries a `severity` and a
# `component` label plus a `runbook` annotation pointing at the matching
# document under docs/runbooks/.
#
# Only $labels, $value, $externalLabels and $externalURL are available inside
# annotation templates; the `for` duration is not exposed as a variable, so it
# is written out literally where a message needs it.
groups:
  - name: veza_backend_critical
    interval: 30s
    rules:
      # Circuit breaker open.
      # The rule treats a state value of 2 as OPEN (see the annotations).
      - alert: VezaCircuitBreakerOpen
        expr: veza_circuit_breaker_state == 2
        for: 5m
        labels:
          severity: critical
          component: circuit_breaker
        annotations:
          summary: "Circuit breaker ouvert depuis plus de 5 minutes"
          description: "Circuit breaker '{{ $labels.circuit_breaker_name }}' est en état OPEN depuis plus de 5 minutes. Les requêtes vers ce service sont rejetées."
          runbook: "docs/runbooks/circuit_breaker_open.md"

      # DB pool above 80% of capacity.
      # MaxOpenConns is configured to 25 in internal/config/config.go; the
      # divisor below and the description must be kept in sync with it.
      # $value is the usage ratio (0..1), not a connection count.
      - alert: VezaDBPoolHighUsage
        expr: veza_db_pool_open_connections / 25 > 0.8
        for: 5m
        labels:
          severity: warning
          component: database
        annotations:
          summary: "DB pool utilisation > 80%"
          description: "Pool de connexions DB utilise {{ $value | humanizePercentage }} de sa capacité (maximum 25 connexions)."
          runbook: "docs/runbooks/db_down.md"

      # DB pool exhausted (the wait counter keeps increasing).
      - alert: VezaDBPoolExhausted
        expr: |
          rate(veza_db_pool_wait_count_total[5m]) > 0.1
        for: 2m
        labels:
          severity: critical
          component: database
        annotations:
          summary: "DB pool épuisé - connexions en attente"
          description: "Le pool DB est saturé. Taux d'attente: {{ $value | humanize }} requêtes/seconde."
          runbook: "docs/runbooks/db_down.md"

  - name: veza_backend_errors
    interval: 30s
    rules:
      # High 5xx error ratio: share of all requests answered with a 5xx
      # status over the last 5 minutes.
      - alert: VezaHigh5xxRate
        expr: |
          (
            sum(rate(veza_gin_http_requests_total{status=~"5.."}[5m]))
            /
            sum(rate(veza_gin_http_requests_total[5m]))
          ) > 0.05
        for: 5m
        labels:
          severity: warning
          component: api
        annotations:
          summary: "Taux erreurs 5xx > 5%"
          description: "{{ $value | humanizePercentage }} des requêtes retournent 5xx sur les 5 dernières minutes."
          runbook: "docs/runbooks/high_error_rate.md"

      # High absolute 5xx rate, independent of total traffic.
      - alert: VezaHigh5xxAbsolute
        expr: |
          sum(rate(veza_gin_http_requests_total{status=~"5.."}[5m])) > 10
        for: 2m
        labels:
          severity: critical
          component: api
        annotations:
          summary: "Plus de 10 erreurs 5xx/seconde"
          description: "{{ $value | humanize }} erreurs 5xx/seconde détectées."
          runbook: "docs/runbooks/high_error_rate.md"

  - name: veza_backend_latency
    interval: 30s
    rules:
      # High latency on critical endpoints.
      # P95 > 1s for /api/v1/tracks, /api/v1/auth/login, /api/v1/upload.
      # The first branch evaluates the three endpoints as one aggregate
      # histogram. Both branches produce a label-less sample, so the second
      # branch only contributes when the aggregate is below the threshold
      # while /api/v1/tracks on its own is above it.
      - alert: VezaHighLatencyCriticalEndpoints
        expr: |
          (
            histogram_quantile(0.95,
              sum(rate(veza_gin_http_request_duration_seconds_bucket{
                path=~"/api/v1/(tracks|auth/login|upload).*"
              }[5m])) by (le)
            ) > 1.0
          )
          OR
          (
            histogram_quantile(0.95,
              sum(rate(veza_gin_http_request_duration_seconds_bucket{
                path=~"/api/v1/tracks.*"
              }[5m])) by (le)
            ) > 1.0
          )
        for: 5m
        labels:
          severity: warning
          component: api
        annotations:
          summary: "Latence P95 > 1s sur endpoints critiques"
          description: "Latence P95: {{ $value }}s sur endpoints critiques."
          runbook: "docs/runbooks/high_latency.md"

      # Very high P99 latency across all endpoints.
      - alert: VezaVeryHighLatency
        expr: |
          histogram_quantile(0.99,
            sum(rate(veza_gin_http_request_duration_seconds_bucket[5m])) by (le)
          ) > 5.0
        for: 3m
        labels:
          severity: warning
          component: api
        annotations:
          summary: "Latence P99 > 5s"
          description: "Latence P99: {{ $value }}s (très élevée)."
          runbook: "docs/runbooks/high_latency.md"

  - name: veza_backend_health
    interval: 30s
    rules:
      # Readiness check failed: the scrape target for the job reports up == 0.
      - alert: VezaReadinessFailed
        expr: |
          up{job="veza-backend-api"} == 0
        for: 1m
        labels:
          severity: critical
          component: health
        annotations:
          summary: "Service veza-backend-api down"
          description: "Le service ne répond plus (readiness check failed)."
          runbook: "docs/runbooks/service_down.md"

      # Health check degraded: status below 1 for 10 minutes.
      - alert: VezaHealthDegraded
        expr: |
          veza_health_check_status < 1
        for: 10m
        labels:
          severity: warning
          component: health
        annotations:
          summary: "Service en mode dégradé"
          description: "Un ou plusieurs services optionnels sont down (Redis/RabbitMQ)."
          runbook: "docs/runbooks/service_degraded.md"