#!/bin/bash
# DB Down Drill - operational validation script.
# Goal:  verify that /readyz returns HTTP 503 with status "not_ready" while the
#        database is down.
# Usage: ./scripts/ops_drills/db_down_drill.sh [API_URL] [PROMETHEUS_URL]
# Tools invoked by this script: curl, jq (plus sudo/systemctl or docker for option 1).

set -e

# Positional arguments, each with a local-development default.
API_URL="${1:-http://localhost:8080}"
PROMETHEUS_URL="${2:-http://localhost:9090}"

# The log lives in world-writable /tmp and is later opened with `tee -a`, so it
# is created atomically with an unpredictable suffix instead of relying on a
# guessable timestamp-only name. The template ends in X's so that it works with
# every common mktemp implementation.
DRILL_LOG="$(mktemp "/tmp/db_down_drill_$(date +%Y%m%d_%H%M%S).log.XXXXXX")"

# Banner: printed to the terminal only (the log helper is defined further down).
echo "========================================="
echo "DB Down Drill - $(date)"
echo "========================================="
echo "API URL: $API_URL"
echo "Prometheus URL: $PROMETHEUS_URL"
echo "Log: $DRILL_LOG"
echo ""
#######################################
# Logging helper: prints a timestamped message to stdout and appends the same
# line to the drill log file.
# Globals:   DRILL_LOG (read) - path of the log file that is appended to
# Arguments: $1 - message text (may be empty)
# Outputs:   "[HH:MM:SS] <message>" on stdout and in DRILL_LOG
# Returns:   always 0
#######################################
log() {
  # Logging is best-effort: the script runs under `set -e`, and a failing
  # `tee` (log file not writable) must not abort the drill half-way.
  # The message is still written to stdout in that case.
  printf '[%s] %s\n' "$(date +%H:%M:%S)" "${1-}" | tee -a "$DRILL_LOG" || true
}
#######################################
# Runs an instant query against Prometheus for one metric and, optionally,
# compares the returned value with an expected one.
# Globals:   PROMETHEUS_URL (read) - base URL of the Prometheus server
# Arguments: $1 - metric name (sent as the `query` parameter)
#            $2 - optional expected value, compared as a string
# Outputs:   progress and result lines through `log`
# Returns:   0 if a value was found (and equals $2 when $2 is given),
#            1 if the metric is unavailable or the value differs
#######################################
check_prometheus_metric() {
  local metric=$1
  local expected_value=${2-}
  local result

  log "Vérification métrique: $metric"

  # -G with --data-urlencode keeps this a GET request while safely encoding
  # metric expressions that contain braces, quotes or spaces.
  # The declaration is kept separate from the assignment so that the exit
  # status of the pipeline is not masked by `local`.
  result=$(curl -sG --max-time 10 --data-urlencode "query=${metric}" \
      "${PROMETHEUS_URL}/api/v1/query" \
    | jq -r '.data.result[0].value[1]' 2>/dev/null) || result="N/A"

  # Empty: curl produced nothing, so jq had no input and printed nothing.
  # "null": the query succeeded but returned no sample.
  if [[ -z "$result" || "$result" == "N/A" || "$result" == "null" ]]; then
    log " ✗ Métrique non trouvée ou non disponible"
    return 1
  fi

  log " ✓ Métrique trouvée: $metric = $result"

  # Without an expected value, finding the metric is enough.
  if [[ -z "$expected_value" ]]; then
    return 0
  fi

  if [[ "$result" == "$expected_value" ]]; then
    log " ✓ Valeur attendue: $expected_value"
    return 0
  fi

  log " ⚠ Valeur inattendue: attendu $expected_value, obtenu $result"
  return 1
}
# Step 1: initial state (baseline) - record what /readyz and the DB pool
# metrics look like before the outage is simulated.
log "=== ÉTAPE 1: État initial (baseline) ==="
log "Vérification /readyz avant coupure DB..."

# curl appends "\n<http_code>" to the body. On a failed transfer its -w output
# is already "\n000", so only the exit status is swallowed here; echoing a
# second "\n000" would end up inside the parsed body.
# --max-time keeps the drill from hanging on an unresponsive API.
initial_response=$(curl -s --max-time 10 -w "\n%{http_code}" "$API_URL/readyz") || true

# Last line = HTTP status, everything before it = body. Parameter expansion is
# used instead of `head -n -1`, which is not supported by every `head`.
initial_status=${initial_response##*$'\n'}
initial_body=$(printf '%s' "${initial_response%$'\n'*}")
# Nothing at all came back (for example curl is not installed).
[[ -n "$initial_status" ]] || initial_status="000"

if [[ "$initial_status" == "200" ]]; then
  log "✓ /readyz retourne 200 (état normal)"
  initial_status_value=$(jq -r '.data.status' <<<"$initial_body" 2>/dev/null) \
    || initial_status_value="unknown"
  log " Status: $initial_status_value"
else
  log "⚠ /readyz retourne $initial_status (peut être normal si DB déjà down)"
fi

# Initial DB metrics: informational only, so a missing metric is tolerated.
log ""
log "Métriques DB initiales:"
check_prometheus_metric "veza_db_pool_open_connections" || true
check_prometheus_metric "veza_db_pool_in_use" || true
# Step 2: simulate the database outage using the method chosen by the operator.
# Sets DB_STOPPED=true when an outage was (or is assumed to have been)
# triggered, so that the restoration step knows whether to offer a restore.
echo ""
log "=== ÉTAPE 2: Simulation DB Down ==="
log ""
log "OPTIONS:"
log " 1) Arrêter PostgreSQL (nécessite sudo)"
log " 2) Modifier DATABASE_URL pour DSN invalide (recommandé pour staging)"
log " 3) Utiliser firewall pour bloquer port 5432"
log ""

# `read` returns non-zero at end of input; under `set -e` that would abort the
# script, so a closed stdin is treated like an explicit "skip".
read -r -p "Choisir option (1/2/3) ou 'skip' pour continuer avec DB actuelle: " option \
  || option="skip"

DB_STOPPED=false

case "$option" in
  1)
    log "Arrêt PostgreSQL..."
    if command -v systemctl &>/dev/null; then
      # A failure here aborts the drill (set -e) before anything was changed.
      sudo systemctl stop postgresql
      DB_STOPPED=true
    elif command -v docker &>/dev/null; then
      if docker stop veza-postgres 2>/dev/null; then
        DB_STOPPED=true
      else
        # Fall back to any running container started from the postgres image.
        # IDs go into an array so that `docker stop` is never called without
        # arguments and none of them is subject to word splitting.
        mapfile -t pg_containers < <(docker ps -q --filter "ancestor=postgres" 2>/dev/null)
        if ((${#pg_containers[@]} > 0)) && docker stop "${pg_containers[@]}" 2>/dev/null; then
          DB_STOPPED=true
        else
          # Nothing was stopped, so DB_STOPPED stays false and no restore is offered.
          log "⚠ Aucun container PostgreSQL arrêté"
        fi
      fi
    else
      log "⚠ Impossible d'arrêter PostgreSQL automatiquement"
      log " Arrêtez PostgreSQL manuellement et appuyez sur Entrée"
      read -r _ || true
      DB_STOPPED=true
    fi
    ;;
  2)
    log "⚠ Pour modifier DATABASE_URL, redémarrez l'application avec:"
    log " DATABASE_URL=postgresql://invalid:invalid@invalid:5432/invalid"
    log " Puis appuyez sur Entrée pour continuer..."
    read -r _ || true
    DB_STOPPED=true
    ;;
  3)
    log "⚠ Pour bloquer le port 5432, utilisez:"
    log " sudo iptables -A OUTPUT -p tcp --dport 5432 -j DROP"
    log " Puis appuyez sur Entrée pour continuer..."
    read -r _ || true
    DB_STOPPED=true
    ;;
  skip)
    log "Skip - Utilisation DB actuelle (peut être déjà down)"
    ;;
  *)
    log "Option invalide, skip"
    ;;
esac
# Wait a few seconds so that existing connections have time to expire.
log ""
log "Attente 5 secondes pour propagation..."
sleep 5

# Step 3: check /readyz after the outage. Sets SUCCESS=false when the HTTP
# status or the reported status is not the expected one.
log ""
log "=== ÉTAPE 3: Vérification /readyz après coupure DB ==="

# curl appends "\n<http_code>" to the body. On a failed transfer its -w output
# is already "\n000", so only the exit status is swallowed; echoing another
# "\n000" would end up inside the logged body.
# --max-time matters here: with the database down the endpoint may not answer.
readyz_response=$(curl -s --max-time 10 -w "\n%{http_code}" "$API_URL/readyz") || true

# Last line = HTTP status, everything before it = body (no GNU-only `head -n -1`).
readyz_status=${readyz_response##*$'\n'}
readyz_body=$(printf '%s' "${readyz_response%$'\n'*}")
[[ -n "$readyz_status" ]] || readyz_status="000"

log "Réponse /readyz:"
log " HTTP Status: $readyz_status"
log " Body: $readyz_body"

# Checks
SUCCESS=true

if [[ "$readyz_status" == "503" ]]; then
  log "✓ HTTP Status = 503 (Service Unavailable) - CORRECT"
else
  log "✗ HTTP Status = $readyz_status (attendu: 503) - ÉCHEC"
  SUCCESS=false
fi

# jq prints nothing for an empty body; report that as "unknown" rather than ''.
status_value=$(jq -r '.data.status' <<<"$readyz_body" 2>/dev/null) || status_value=""
status_value=${status_value:-unknown}
if [[ "$status_value" == "not_ready" ]]; then
  log "✓ Status = 'not_ready' - CORRECT"
else
  log "✗ Status = '$status_value' (attendu: 'not_ready') - ÉCHEC"
  SUCCESS=false
fi

# The per-check database status is reported as a warning only; it does not
# change SUCCESS.
db_check_status=$(jq -r '.data.checks.database.status' <<<"$readyz_body" 2>/dev/null) \
  || db_check_status=""
db_check_status=${db_check_status:-unknown}
if [[ "$db_check_status" == "error" ]]; then
  log "✓ DB check status = 'error' - CORRECT"
else
  log "⚠ DB check status = '$db_check_status' (attendu: 'error')"
fi
# Step 4: look up the DB pool metrics in Prometheus. Both lookups are
# informational: a missing metric only produces a warning line.
log ""
log "=== ÉTAPE 4: Vérification métriques Prometheus ==="

# The DB metrics should still be exposed even while the DB is down.
log "Vérification métriques DB pool..."
if ! check_prometheus_metric "veza_db_pool_open_connections"; then
  log " ⚠ Métrique non disponible (peut être normal)"
fi

# Wait count (expected to grow when the pool is saturated).
log ""
log "Vérification wait count (devrait augmenter si pool saturé)..."
if ! check_prometheus_metric "veza_db_pool_wait_count_total"; then
  log " ⚠ Métrique non disponible"
fi
# Step 5: list the Prometheus alerts named VezaDBPoolExhausted or
# VezaReadinessFailed that the alerts API currently returns.
# Informational only: the outcome does not change SUCCESS.
log ""
log "=== ÉTAPE 5: Vérification alertes Prometheus ==="

# Fetch first, parse second: an unreachable Prometheus must not be reported as
# "no alert triggered".
alerts_json=$(curl -s --max-time 10 "$PROMETHEUS_URL/api/v1/alerts") || alerts_json=""

if [[ -z "$alerts_json" ]]; then
  log "⚠ Prometheus injoignable: impossible de récupérer les alertes"
  log " Vérifier manuellement: $PROMETHEUS_URL/alerts"
else
  alerts=$(jq -r '.data.alerts[] | select(.labels.alertname == "VezaDBPoolExhausted" or .labels.alertname == "VezaReadinessFailed") | .labels.alertname' <<<"$alerts_json" 2>/dev/null) || alerts=""

  if [[ -n "$alerts" ]]; then
    log "✓ Alertes déclenchées:"
    # One alert name per line; -r keeps backslashes literal.
    while IFS= read -r alert; do
      log " - $alert"
    done <<<"$alerts"
  else
    log "⚠ Aucune alerte déclenchée (peut être normal si seuils non atteints)"
    log " Vérifier manuellement: $PROMETHEUS_URL/alerts"
  fi
fi
# Step 6: restoration. Only offered when the outage step set DB_STOPPED=true.
# Nothing in this step is allowed to abort the script: the summary below must
# still run, and a failed restore has to be visible in the log.
log ""
log "=== ÉTAPE 6: Restauration ==="
if [[ "${DB_STOPPED:-false}" == "true" ]]; then
  # If stdin is closed, `read` fails; default to restoring so that a database
  # stopped by this drill is not left down.
  read -r -p "Restaurer DB? (y/n): " restore || restore="y"

  if [[ "$restore" == [yY] ]]; then
    case "$option" in
      1)
        if command -v systemctl &>/dev/null; then
          # Checked explicitly: under `set -e` a bare failing command would
          # end the script here without any message.
          if sudo systemctl start postgresql; then
            log "✓ PostgreSQL redémarré"
          else
            log "⚠ Échec du redémarrage de PostgreSQL - redémarrer manuellement"
          fi
        elif command -v docker &>/dev/null; then
          docker start veza-postgres 2>/dev/null \
            || log "⚠ Redémarrer manuellement le container PostgreSQL"
        fi
        ;;
      3)
        log "Débloquer port 5432:"
        log " sudo iptables -D OUTPUT -p tcp --dport 5432 -j DROP"
        ;;
    esac

    log "Attente 10 secondes pour reconnexion..."
    sleep 10

    log "Vérification /readyz après restauration..."
    # An unreachable API yields an empty response (reported as "unknown")
    # instead of terminating the script through `set -e`.
    restored_response=$(curl -s --max-time 10 "$API_URL/readyz") || restored_response=""
    restored_status=$(jq -r '.data.status' <<<"$restored_response" 2>/dev/null) \
      || restored_status=""
    restored_status=${restored_status:-unknown}
    log " Status: $restored_status"

    if [[ "$restored_status" == "ready" || "$restored_status" == "degraded" ]]; then
      log "✓ Service restauré"
    else
      log "⚠ Service pas encore restauré (attendre plus longtemps)"
    fi
  fi
fi
# Summary: report the drill outcome and turn it into the script's exit status
# (0 when SUCCESS is "true", 1 otherwise).
log ""
log "========================================="
log "RÉSUMÉ DU DRILL"
log "========================================="

# Failure path first, so the success report reads straight through.
if [[ "$SUCCESS" != "true" ]]; then
  log "✗ DRILL ÉCHOUÉ"
  log " Vérifier les points d'échec ci-dessus"
  exit 1
fi

log "✓ DRILL RÉUSSI"
log " - /readyz retourne 503 quand DB down"
log " - Status = 'not_ready'"
log " - Métriques exposées"
exit 0