veza/infra/ansible/tests/test_minio_resilience.sh
senke d86815561c
feat(infra): MinIO distributed EC:2 + migration script (W3 Day 12)
Four-node distributed MinIO cluster, single erasure set EC:2, tolerates
2 simultaneous node losses. 50% storage efficiency. Pinned to
RELEASE.2025-09-07T16-13-09Z to match docker-compose so dev/prod
parity is preserved.
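The 50% efficiency and 2-loss tolerance follow directly from the shard layout; a quick sketch of the arithmetic (illustrative only, not part of the role):

```shell
# EC:2 on a 4-drive erasure set: each object is written as 2 data + 2
# parity shards, and any 2 of the 4 shards suffice to reconstruct it.
data_shards=2
parity_shards=2
total=$((data_shards + parity_shards))
echo "usable fraction: ${data_shards}/${total}"   # 2/4 -> 50% storage efficiency
echo "tolerated losses: ${parity_shards}"         # 2 simultaneous node/drive losses
```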

- infra/ansible/roles/minio_distributed/ : install pinned binary,
  systemd unit pointed at MINIO_VOLUMES with bracket-expansion form,
  EC:2 forced via MINIO_STORAGE_CLASS_STANDARD. Vault assertion
  blocks shipping placeholder credentials to staging/prod.
- bucket init : creates veza-prod-tracks, enables versioning, applies
  lifecycle.json (30d noncurrent expiry + 7d abort-multipart). Cold-tier
  transition ready but inert until minio_remote_tier_name is set.
- infra/ansible/playbooks/minio_distributed.yml : provisions the 4
  containers, applies common baseline + role.
- infra/ansible/inventory/lab.yml : new minio_nodes group.
- infra/ansible/tests/test_minio_resilience.sh : kill 2 nodes,
  verify EC:2 reconstruction (read OK + checksum matches), restart,
  wait for self-heal.
- scripts/minio-migrate-from-single.sh : mc mirror --preserve from
  the single-node bucket to the new cluster, count-verifies, prints
  rollout next-steps.
- config/prometheus/alert_rules.yml : MinIODriveOffline (warn) +
  MinIONodesUnreachable (page) — page fires at >= 2 nodes unreachable
  because that's the redundancy ceiling for EC:2.
- docs/ENV_VARIABLES.md §12 : MinIO migration cross-ref.
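The bracket-expansion volume spec and forced EC:2 mentioned above might look like the following environment fragment; hostnames, paths, and the file location are illustrative assumptions, not the role's actual values:

```shell
# /etc/default/minio — sketch only; hostnames and data path are assumptions.
# The {1...4} bracket expansion enumerates all 4 nodes as one erasure set.
MINIO_VOLUMES="http://minio-{1...4}.internal:9000/data"
# Force 2 parity shards instead of relying on MinIO's auto-selected default.
MINIO_STORAGE_CLASS_STANDARD="EC:2"
# Real credentials come from Vault; the role refuses to ship placeholders.
MINIO_ROOT_USER="REPLACED_BY_VAULT"
MINIO_ROOT_PASSWORD="REPLACED_BY_VAULT"
```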

Acceptance (Day 12) : EC:2 survives 2 concurrent kills + self-heals.
Lab apply pending. No backend code change — interface stays AWS S3.

W3 progress : Redis Sentinel ✓ (Day 11), MinIO distributed ✓ (this),
CDN (Day 13), DMCA (Day 14), embed (Day 15).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-28 13:46:42 +02:00


#!/usr/bin/env bash
# test_minio_resilience.sh — validate distributed MinIO survives 2
# simultaneous node losses (EC:2 acceptance criterion).
#
# Sequence :
# 1. upload a 100 MB random file to veza-prod-tracks
# 2. stop 2 of the 4 minio containers
# 3. read the file back through a surviving node — must succeed
# 4. restart the stopped containers
# 5. wait for self-heal
# 6. assert all 4 nodes report healthy
#
# v1.0.9 W3 Day 12 — acceptance for ROADMAP_V1.0_LAUNCH.md §Semaine 3
# day 12: "EC4+2 survives 2 node kills, MinIO healthcheck dashboard green".
#
# Usage:
# MINIO_ROOT_USER=... MINIO_ROOT_PASSWORD=... \
# bash infra/ansible/tests/test_minio_resilience.sh
#
# Exit codes:
# 0 — survived 2-node loss, self-heal completed
# 1 — cluster not healthy at start
# 2 — read failed during 2-node loss (EC:2 didn't deliver)
# 3 — required tool missing
# 4 — self-heal didn't complete within timeout
set -euo pipefail
CONTAINERS=(minio-1 minio-2 minio-3 minio-4)
# Word-split KILL_NODES deliberately so several names can be passed at once.
read -r -a KILL <<< "${KILL_NODES:-minio-2 minio-3}"
BUCKET=${BUCKET:-veza-prod-tracks}
TEST_OBJECT_SIZE_MB=${TEST_OBJECT_SIZE_MB:-100}
HEAL_TIMEOUT_SECONDS=${HEAL_TIMEOUT_SECONDS:-300}
MINIO_ROOT_USER=${MINIO_ROOT_USER:-?}
MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-?}
log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; }
fail() { log "FAIL: $*"; exit "${2:-2}"; }
require() {
  command -v "$1" >/dev/null 2>&1 || fail "required tool missing on host: $1" 3
}
require incus
require date
require dd
if [ "$MINIO_ROOT_USER" = "?" ] || [ "$MINIO_ROOT_PASSWORD" = "?" ]; then
  fail "MINIO_ROOT_USER and MINIO_ROOT_PASSWORD env vars are required" 3
fi
# Helper : run mc inside a chosen surviving container so we don't
# need mc on the host. Each container has /usr/local/bin/mc.
mc_in() {
  local ct=$1; shift
  incus exec "$ct" -- /usr/local/bin/mc "$@"
}
# Helper : (re-)set the alias on the chosen container.
mc_alias() {
  local ct=$1
  mc_in "$ct" alias set veza-local "http://localhost:9000" \
    "$MINIO_ROOT_USER" "$MINIO_ROOT_PASSWORD" >/dev/null
}
# -----------------------------------------------------------------------------
# 0. Sanity — cluster healthy at start.
# -----------------------------------------------------------------------------
log "step 0: pre-flight — cluster health on minio-1"
mc_alias minio-1
admin_info=$(mc_in minio-1 admin info veza-local 2>&1 || true)
log "admin info:"
echo "$admin_info" | sed 's/^/ /' >&2
if echo "$admin_info" | grep -qiE "offline|unreachable"; then
  fail "cluster reports offline drives at start — refusing to test from a degraded baseline" 1
fi
# -----------------------------------------------------------------------------
# 1. Upload test object.
# -----------------------------------------------------------------------------
log "step 1: generating + uploading ${TEST_OBJECT_SIZE_MB}MB test object"
incus exec minio-1 -- bash -c "dd if=/dev/urandom of=/tmp/resilience-test.bin bs=1M count=${TEST_OBJECT_SIZE_MB} status=none"
src_sha=$(incus exec minio-1 -- sha256sum /tmp/resilience-test.bin | awk '{print $1}')
mc_in minio-1 cp /tmp/resilience-test.bin "veza-local/${BUCKET}/resilience-test.bin"
log "source SHA-256: $src_sha"
# -----------------------------------------------------------------------------
# 2. Stop 2 nodes — simulate concurrent failures.
# -----------------------------------------------------------------------------
log "step 2: stopping ${KILL[*]} (concurrent failure simulation)"
for ct in "${KILL[@]}"; do
  incus stop --force "$ct" &
done
wait
# -----------------------------------------------------------------------------
# 3. Read back through a surviving node — EC:2 must reconstruct.
# -----------------------------------------------------------------------------
survivors=()
for ct in "${CONTAINERS[@]}"; do
  for k in "${KILL[@]}"; do [ "$ct" = "$k" ] && continue 2; done
  survivors+=("$ct")
done
read_via=${survivors[0]}
log "step 3: reading back via $read_via (EC:2 should reconstruct)"
mc_alias "$read_via"
sleep 5 # give MinIO a moment to mark the killed nodes offline
if ! mc_in "$read_via" cp "veza-local/${BUCKET}/resilience-test.bin" /tmp/resilience-readback.bin; then
  fail "read failed during 2-node loss — EC:2 did not deliver the redundancy promise" 2
fi
read_sha=$(incus exec "$read_via" -- sha256sum /tmp/resilience-readback.bin | awk '{print $1}')
if [ "$src_sha" != "$read_sha" ]; then
  fail "checksum mismatch: source=$src_sha read=$read_sha — silent corruption during reconstruction" 2
fi
log "checksum matches under degraded mode"
# -----------------------------------------------------------------------------
# 4. Restart the stopped nodes.
# -----------------------------------------------------------------------------
log "step 4: restarting ${KILL[*]}"
for ct in "${KILL[@]}"; do
  incus start "$ct" &
done
wait
# -----------------------------------------------------------------------------
# 5. Wait for self-heal.
# -----------------------------------------------------------------------------
log "step 5: waiting for self-heal (timeout ${HEAL_TIMEOUT_SECONDS}s)"
deadline=$(( $(date +%s) + HEAL_TIMEOUT_SECONDS ))
healed=0
while [ "$(date +%s)" -lt "$deadline" ]; do
  mc_alias minio-1 || { sleep 5; continue; }
  info=$(mc_in minio-1 admin info veza-local 2>&1 || true)
  # Healthy when nothing reports offline/unreachable AND all 4 servers show
  # as online; accept "4 ... online" as well as "Online ... 4" since the
  # mc admin info summary wording varies between releases.
  if ! echo "$info" | grep -qiE "offline|unreachable" && \
     echo "$info" | grep -qiE "(4|four)[^0-9]*online|online[^0-9]*(4|four)"; then
    healed=1
    break
  fi
  sleep 5
done
if [ "$healed" -ne 1 ]; then
  log "final admin info:"
  mc_in minio-1 admin info veza-local 2>&1 | sed 's/^/ /' >&2 || true
  fail "self-heal did not complete within ${HEAL_TIMEOUT_SECONDS}s" 4
fi
# -----------------------------------------------------------------------------
# 6. Cleanup.
# -----------------------------------------------------------------------------
log "step 6: cleanup test object"
mc_in minio-1 rm "veza-local/${BUCKET}/resilience-test.bin" || true
incus exec minio-1 -- rm -f /tmp/resilience-test.bin /tmp/resilience-readback.bin || true
log "PASS: cluster survived ${#KILL[@]}-node loss + self-healed within budget"
exit 0