# Prometheus alerting rules for the Veza backend API.
groups:
# Critical backend alerts: circuit breaker state and DB connection pool.
- name: veza_backend_critical
  interval: 30s
  rules:
  # Circuit breaker open.
  # Fires once veza_circuit_breaker_state has been equal to 2 for 5 minutes
  # (the annotations below describe that value as the OPEN state).
  # Only $labels, $value, $externalLabels and $externalURL exist in alert
  # templates, so the duration is written literally; keep it in sync with `for`.
  - alert: VezaCircuitBreakerOpen
    expr: veza_circuit_breaker_state == 2
    for: 5m
    labels:
      severity: critical
      component: circuit_breaker
    annotations:
      summary: "Circuit breaker ouvert depuis plus de 5 minutes"
      description: "Circuit breaker '{{ $labels.circuit_breaker_name }}' est en état OPEN depuis plus de 5 minutes. Les requêtes vers ce service sont rejetées."
      runbook: "docs/runbooks/circuit_breaker_open.md"

  # DB pool above 80% of capacity.
  # The divisor 25 mirrors MaxOpenConns, which the original note says is
  # configured in internal/config/config.go; update both together.
  # $value is the usage ratio (open connections / 25), not a connection count.
  - alert: VezaDBPoolHighUsage
    expr: veza_db_pool_open_connections / 25 > 0.8
    for: 5m
    labels:
      severity: warning
      component: database
    annotations:
      summary: "DB pool utilisation > 80%"
      description: "Pool de connexions DB utilise {{ $value | humanizePercentage }} de sa capacité (25 connexions maximum)."
      runbook: "docs/runbooks/db_down.md"

  # DB pool exhausted: the wait counter is increasing.
  # Fires when the per-second rate of veza_db_pool_wait_count_total over
  # 5 minutes stays above 0.1 for 2 minutes.
  - alert: VezaDBPoolExhausted
    expr: rate(veza_db_pool_wait_count_total[5m]) > 0.1
    for: 2m
    labels:
      severity: critical
      component: database
    annotations:
      summary: "DB pool épuisé - connexions en attente"
      description: "Le pool DB est saturé. Taux d'attente: {{ $value | humanize }} requêtes/seconde."
      runbook: "docs/runbooks/db_down.md"
# HTTP 5xx error alerts: relative share of traffic and absolute rate.
- name: veza_backend_errors
  interval: 30s
  rules:
  # High 5xx error ratio.
  # Divides the 5-minute rate of requests whose status matches 5.. by the
  # 5-minute rate of all requests; fires when the ratio stays above 5% for 5m.
  - alert: VezaHigh5xxRate
    expr: |
      (
        sum(rate(veza_gin_http_requests_total{status=~"5.."}[5m]))
        /
        sum(rate(veza_gin_http_requests_total[5m]))
      ) > 0.05
    for: 5m
    labels:
      severity: warning
      component: api
    annotations:
      summary: "Taux erreurs 5xx > 5%"
      description: "{{ $value | humanizePercentage }} des requêtes retournent 5xx sur les 5 dernières minutes."
      runbook: "docs/runbooks/high_error_rate.md"

  # High absolute 5xx rate.
  # Fires when more than 10 5xx responses per second (5-minute rate, summed
  # over all series) are observed for 2 minutes, regardless of total traffic.
  - alert: VezaHigh5xxAbsolute
    expr: sum(rate(veza_gin_http_requests_total{status=~"5.."}[5m])) > 10
    for: 2m
    labels:
      severity: critical
      component: api
    annotations:
      summary: "Plus de 10 erreurs 5xx/seconde"
      description: "{{ $value | humanize }} erreurs 5xx/seconde détectées."
      runbook: "docs/runbooks/high_error_rate.md"
# Request latency alerts derived from the HTTP request duration histogram.
- name: veza_backend_latency
  interval: 30s
  rules:
  # High latency on critical endpoints.
  # Fires when the P95 aggregated over paths starting with /api/v1/tracks,
  # /api/v1/auth/login or /api/v1/upload exceeds 1s, or when the P95 of
  # /api/v1/tracks alone does. Both branches aggregate by `le` only, so each
  # yields a single label-less series; when both exceed the threshold, $value
  # is the left-hand (combined) result.
  - alert: VezaHighLatencyCriticalEndpoints
    expr: |
      (
        histogram_quantile(0.95,
          sum(rate(veza_gin_http_request_duration_seconds_bucket{
            path=~"/api/v1/(tracks|auth/login|upload).*"
          }[5m])) by (le)
        ) > 1.0
      ) OR (
        histogram_quantile(0.95,
          sum(rate(veza_gin_http_request_duration_seconds_bucket{
            path=~"/api/v1/tracks.*"
          }[5m])) by (le)
        ) > 1.0
      )
    for: 5m
    labels:
      severity: warning
      component: api
    annotations:
      summary: "Latence P95 > 1s sur endpoints critiques"
      description: "Latence P95: {{ $value }}s sur endpoints critiques."
      runbook: "docs/runbooks/high_latency.md"

  # Very high P99 latency.
  # Fires when the P99 across all requests (no path filter) stays above 5s
  # for 3 minutes.
  - alert: VezaVeryHighLatency
    expr: |
      histogram_quantile(0.99,
        sum(rate(veza_gin_http_request_duration_seconds_bucket[5m])) by (le)
      ) > 5.0
    for: 3m
    labels:
      severity: warning
      component: api
    annotations:
      summary: "Latence P99 > 5s"
      description: "Latence P99: {{ $value }}s (très élevée)."
      runbook: "docs/runbooks/high_latency.md"
# Service availability and degraded-mode alerts.
- name: veza_backend_health
  interval: 30s
  rules:
  # Service unreachable.
  # `up` is the scrape-health series Prometheus records per target; 0 means
  # the last scrape of job "veza-backend-api" failed. Fires after 1 minute.
  - alert: VezaReadinessFailed
    expr: up{job="veza-backend-api"} == 0
    for: 1m
    labels:
      severity: critical
      component: health
    annotations:
      summary: "Service veza-backend-api down"
      description: "Le service ne répond plus (readiness check failed)."
      runbook: "docs/runbooks/service_down.md"

  # Health check degraded.
  # Fires when veza_health_check_status has been below 1 for 10 minutes.
  # NOTE(review): presumably 1 means fully healthy; confirm against the
  # exporter that publishes this gauge.
  - alert: VezaHealthDegraded
    expr: veza_health_check_status < 1
    for: 10m
    labels:
      severity: warning
      component: health
    annotations:
      summary: "Service en mode dégradé"
      description: "Un ou plusieurs services optionnels sont down (Redis/RabbitMQ)."
      runbook: "docs/runbooks/service_degraded.md"