Some checks failed
Veza CI / Rust (Stream Server) (push) Successful in 5m21s
Security Scan / Secret Scanning (gitleaks) (push) Failing after 54s
Veza CI / Backend (Go) (push) Failing after 8m27s
Veza CI / Notify on failure (push) Successful in 6s
E2E Playwright / e2e (full) (push) Failing after 12m42s
Veza CI / Frontend (Web) (push) Successful in 15m49s
Four-node distributed MinIO cluster, single erasure set EC:2, tolerates 2 simultaneous node losses. 50% storage efficiency. Pinned to RELEASE.2025-09-07T16-13-09Z to match docker-compose so dev/prod parity is preserved. - infra/ansible/roles/minio_distributed/ : install pinned binary, systemd unit pointed at MINIO_VOLUMES with bracket-expansion form, EC:2 forced via MINIO_STORAGE_CLASS_STANDARD. Vault assertion blocks shipping placeholder credentials to staging/prod. - bucket init : creates veza-prod-tracks, enables versioning, applies lifecycle.json (30d noncurrent expiry + 7d abort-multipart). Cold-tier transition ready but inert until minio_remote_tier_name is set. - infra/ansible/playbooks/minio_distributed.yml : provisions the 4 containers, applies common baseline + role. - infra/ansible/inventory/lab.yml : new minio_nodes group. - infra/ansible/tests/test_minio_resilience.sh : kill 2 nodes, verify EC:2 reconstruction (read OK + checksum matches), restart, wait for self-heal. - scripts/minio-migrate-from-single.sh : mc mirror --preserve from the single-node bucket to the new cluster, count-verifies, prints rollout next-steps. - config/prometheus/alert_rules.yml : MinIODriveOffline (warn) + MinIONodesUnreachable (page) — page fires at >= 2 nodes unreachable because that's the redundancy ceiling for EC:2. - docs/ENV_VARIABLES.md §12 : MinIO migration cross-ref. Acceptance (Day 12) : EC:2 survives 2 concurrent kills + self-heals. Lab apply pending. No backend code change — interface stays AWS S3. W3 progress : Redis Sentinel ✓ (Day 11), MinIO distribué ✓ (this), CDN ⏳ Day 13, DMCA ⏳ Day 14, embed ⏳ Day 15. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
159 lines
6 KiB
Bash
Executable file
159 lines
6 KiB
Bash
Executable file
#!/usr/bin/env bash
|
|
# test_minio_resilience.sh — validate that distributed MinIO survives 2
# simultaneous node losses (EC:2 acceptance criterion).
#
# Sequence :
#   1. upload a 100 MB random file to veza-prod-tracks
#   2. stop 2 of the 4 minio containers
#   3. read the file back through a surviving node — must succeed and the
#      SHA-256 must match the uploaded file
#   4. restart the stopped containers
#   5. wait for self-heal — poll until all 4 nodes report healthy
#   6. clean up the test object
#
# v1.0.9 W3 Day 12 — acceptance for ROADMAP_V1.0_LAUNCH.md §Semaine 3
# day 12: "EC4+2 résiste à 2 nœud kills, dashboard MinIO healthcheck vert"
# (English: "EC4+2 withstands 2 node kills, MinIO healthcheck dashboard
# green").
#
# Usage:
#   MINIO_ROOT_USER=... MINIO_ROOT_PASSWORD=... \
#     bash infra/ansible/tests/test_minio_resilience.sh
#
# Exit codes:
#   0 — survived 2-node loss, self-heal completed
#   1 — cluster not healthy at start
#   2 — read failed or checksum mismatched during 2-node loss
#       (EC:2 didn't deliver)
#   3 — required tool missing, or MINIO_ROOT_USER / MINIO_ROOT_PASSWORD
#       not provided
#   4 — self-heal didn't complete within timeout
|
|
# Strict mode: abort on unhandled errors, unset variables, and failures in
# any stage of a pipeline.
set -euo pipefail

# --- Tunables (each one can be overridden from the environment) --------------

# Every container that is a member of the distributed MinIO cluster.
CONTAINERS=(minio-1 minio-2 minio-3 minio-4)

# Containers to stop during the test, taken from the whitespace-separated
# KILL_NODES variable. `read -a` splits on whitespace WITHOUT subjecting the
# words to pathname expansion, which an unquoted array assignment would do.
# `-d ''` reads to end of input so newline-separated lists keep working; it
# makes `read` return non-zero at EOF, hence the `|| true` under `set -e`.
read -r -d '' -a KILL <<<"${KILL_NODES:-minio-2 minio-3}" || true

# Bucket that receives the test object.
BUCKET=${BUCKET:-veza-prod-tracks}

# Size of the random test object, in MiB (used as dd's count with bs=1M).
TEST_OBJECT_SIZE_MB=${TEST_OBJECT_SIZE_MB:-100}

# Upper bound, in seconds, on the wait for the cluster to heal.
HEAL_TIMEOUT_SECONDS=${HEAL_TIMEOUT_SECONDS:-300}

# "?" is a sentinel meaning "not provided"; the credential check further down
# rejects it.
MINIO_ROOT_USER=${MINIO_ROOT_USER:-?}
MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-?}
|
|
|
|
# log MESSAGE... — write one diagnostic line to stderr, prefixed with the
# current wall-clock time as [HH:MM:SS]. All arguments are joined with spaces.
log() {
  local stamp
  # A failing `date` must not abort the script; an empty stamp is printed.
  stamp=$(date +%H:%M:%S) || true
  printf '[%s] %s\n' "$stamp" "$*" >&2
}
|
|
# fail MESSAGE [EXIT_CODE] — log "FAIL: MESSAGE" via log and terminate the
# script with EXIT_CODE (default 2).
# Only MESSAGE is logged: expanding "$*" here would append the exit-code
# argument to the text (e.g. "... missing on host: incus 3").
fail() {
  log "FAIL: ${1:-}"
  exit "${2:-2}"
}
|
|
|
|
# require TOOL — abort through fail with exit code 3 unless TOOL resolves to
# a command on this host.
require() {
  local tool=$1
  if ! command -v "$tool" >/dev/null 2>&1; then
    fail "required tool missing on host: $tool" 3
  fi
}
|
|
|
|
# Host-side prerequisites for this script.
for tool in incus date dd; do
  require "$tool"
done

# Both credentials default to the "?" sentinel when the caller did not
# export them; refuse to go any further in that case.
if [[ "$MINIO_ROOT_USER" == "?" || "$MINIO_ROOT_PASSWORD" == "?" ]]; then
  fail "MINIO_ROOT_USER and MINIO_ROOT_PASSWORD env vars are required" 3
fi
|
|
|
|
# mc_in CONTAINER ARGS... — run the MinIO client located at
# /usr/local/bin/mc inside CONTAINER with ARGS, via `incus exec`, so that mc
# does not have to be installed on the host. Returns incus exec's status.
mc_in() {
  local container=$1
  incus exec "$container" -- /usr/local/bin/mc "${@:2}"
}
|
|
|
|
# mc_alias CONTAINER — (re-)register the "veza-local" mc alias inside
# CONTAINER, pointing at http://localhost:9000 with the root credentials from
# the environment. mc's stdout is discarded; its exit status is returned.
mc_alias() {
  local container=$1
  local -a alias_args=(
    alias set veza-local "http://localhost:9000"
    "$MINIO_ROOT_USER" "$MINIO_ROOT_PASSWORD"
  )
  mc_in "$container" "${alias_args[@]}" >/dev/null
}
|
|
|
|
# -----------------------------------------------------------------------------
# 0. Sanity — cluster healthy at start.
# -----------------------------------------------------------------------------
log "step 0: pre-flight — cluster health on minio-1"

# Being unable to even talk to the cluster counts as "not healthy at start"
# (exit 1), instead of leaking mc's own exit status through `set -e`.
mc_alias minio-1 \
  || fail "cannot configure the mc alias on minio-1 — cluster unreachable at start" 1

# Keep the exit status: a failed query rarely prints "offline"/"unreachable",
# so swallowing it would let a broken cluster pass the text check below.
admin_rc=0
admin_info=$(mc_in minio-1 admin info veza-local 2>&1) || admin_rc=$?
log "admin info:"
sed 's/^/ /' <<<"$admin_info" >&2

if [ "$admin_rc" -ne 0 ]; then
  fail "mc admin info exited with status ${admin_rc} — cannot establish a healthy baseline" 1
fi

# The summary line of `mc admin info` reads "N drives online, 0 drives
# offline" on a healthy cluster; drop that benign phrase so only real
# offline/unreachable reports remain. Counts such as "10 drives offline" are
# kept because the 0 must not be preceded by another digit.
admin_flags=$(sed -E 's/(^|[^0-9])0 [Dd]rives? [Oo]ffline/\1/g' <<<"$admin_info")

# A here-string (rather than `echo | grep -q`) avoids a SIGPIPE on the writer
# turning a match into a failed pipeline under `pipefail`.
if grep -qiE "offline|unreachable" <<<"$admin_flags"; then
  fail "cluster reports offline drives at start — refusing to test from a degraded baseline" 1
fi
|
|
|
|
# -----------------------------------------------------------------------------
# 1. Upload test object.
# -----------------------------------------------------------------------------
# The size comes from the environment and ends up on a command line inside
# the container: accept digits only.
case "$TEST_OBJECT_SIZE_MB" in
  '' | *[!0-9]*)
    fail "TEST_OBJECT_SIZE_MB must be a whole number of MiB, got '${TEST_OBJECT_SIZE_MB}'" 3
    ;;
esac

log "step 1: generating + uploading ${TEST_OBJECT_SIZE_MB}MB test object"

# dd is invoked directly (no `bash -c` wrapper), so the value is passed as a
# single argument and never re-parsed by a shell inside the container.
incus exec minio-1 -- dd if=/dev/urandom of=/tmp/resilience-test.bin \
  bs=1M count="${TEST_OBJECT_SIZE_MB}" status=none

# Reference checksum, computed inside the container before the upload.
src_sha=$(incus exec minio-1 -- sha256sum /tmp/resilience-test.bin | awk '{print $1}')
mc_in minio-1 cp /tmp/resilience-test.bin "veza-local/${BUCKET}/resilience-test.bin"
log "source SHA-256: $src_sha"
|
|
|
|
# -----------------------------------------------------------------------------
# 2. Stop 2 nodes — simulate concurrent failures.
# -----------------------------------------------------------------------------
log "step 2: stopping ${KILL[*]} (concurrent failure simulation)"

# Stop the nodes in parallel and remember each job's PID.
stop_pids=()
for ct in "${KILL[@]}"; do
  incus stop --force "$ct" &
  stop_pids+=("$!")
done

# A bare `wait` always returns 0, which would hide a failed stop and let the
# test "pass" with fewer nodes down than intended. Waiting on each PID
# surfaces the individual exit statuses.
stop_failed=0
for pid in "${stop_pids[@]}"; do
  wait "$pid" || stop_failed=1
done

# Reported as exit 1: the test precondition could not be established.
if [ "$stop_failed" -ne 0 ]; then
  fail "could not stop every node in: ${KILL[*]} — the node-loss scenario was not established" 1
fi
|
|
|
|
# -----------------------------------------------------------------------------
# 3. Read back through a surviving node — EC:2 must reconstruct.
# -----------------------------------------------------------------------------
# survivors = CONTAINERS minus KILL, in CONTAINERS order.
survivors=()
for ct in "${CONTAINERS[@]}"; do
  killed=0
  for k in "${KILL[@]}"; do
    if [ "$ct" = "$k" ]; then
      killed=1
      break
    fi
  done
  if [ "$killed" -eq 0 ]; then
    survivors+=("$ct")
  fi
done

# Without this guard an all-covering KILL_NODES would crash on the
# `${survivors[0]}` expansion below with an "unbound variable" error.
if [ "${#survivors[@]}" -eq 0 ]; then
  fail "no surviving node to read through — KILL_NODES (${KILL[*]}) covers every container" 3
fi
read_via=${survivors[0]}

log "step 3: reading back via $read_via (EC:2 should reconstruct)"
mc_alias "$read_via"
sleep 5 # give MinIO a moment to mark the killed nodes offline

# The object is downloaded to a file inside the surviving container.
if ! mc_in "$read_via" cp "veza-local/${BUCKET}/resilience-test.bin" /tmp/resilience-readback.bin; then
  fail "read failed during 2-node loss — EC:2 did not deliver the redundancy promise" 2
fi
read_sha=$(incus exec "$read_via" -- sha256sum /tmp/resilience-readback.bin | awk '{print $1}')

# A successful read is not enough: the bytes must match what was uploaded.
if [ "$src_sha" != "$read_sha" ]; then
  fail "checksum mismatch: source=$src_sha read=$read_sha — silent corruption during reconstruction" 2
fi
log "checksum matches under degraded mode"
|
|
|
|
# -----------------------------------------------------------------------------
# 4. Restart the stopped nodes.
# -----------------------------------------------------------------------------
log "step 4: restarting ${KILL[*]}"

# Start the nodes in parallel and remember each job's PID.
start_pids=()
for ct in "${KILL[@]}"; do
  incus start "$ct" &
  start_pids+=("$!")
done

# A bare `wait` always returns 0; wait on each PID so a node that failed to
# start is reported immediately instead of surfacing only as a heal timeout.
start_failed=0
for pid in "${start_pids[@]}"; do
  wait "$pid" || start_failed=1
done

# Reported as exit 4: the cluster cannot heal while a node stays down.
if [ "$start_failed" -ne 0 ]; then
  fail "could not restart every node in: ${KILL[*]} — self-heal cannot complete" 4
fi
|
|
|
|
# -----------------------------------------------------------------------------
# 5. Wait for self-heal.
# -----------------------------------------------------------------------------
log "step 5: waiting for self-heal (timeout ${HEAL_TIMEOUT_SECONDS}s)"
deadline=$(( $(date +%s) + HEAL_TIMEOUT_SECONDS ))

# Drive count expected in the "N drives online" summary.
# NOTE(review): assumes one drive per container — confirm against the
# MINIO_VOLUMES layout of the cluster.
expected_drives=${#CONTAINERS[@]}

healed=0
# Poll every 5 s until the deadline.
while [ "$(date +%s)" -lt "$deadline" ]; do
  # minio-1 may still be coming back up; retry on the next tick.
  mc_alias minio-1 || { sleep 5; continue; }
  info=$(mc_in minio-1 admin info veza-local 2>&1 || true)

  # "0 drives offline" is what a healthy summary line says; drop that benign
  # phrase so it does not trip the offline check. "10 drives offline" is kept
  # because the 0 must not be preceded by another digit.
  flags=$(sed -E 's/(^|[^0-9])0 [Dd]rives? [Oo]ffline/\1/g' <<<"$info")

  # Healed = no offline/unreachable report AND a positive "all online"
  # marker. Both word orders are accepted: the legacy "online ... 4" pattern
  # and the "<N> drives online" summary form.
  if ! grep -qiE "offline|unreachable" <<<"$flags" \
    && grep -qE "[Oo]nline.*(4|four)|(^|[^0-9])${expected_drives} [Dd]rives? [Oo]nline" <<<"$info"; then
    healed=1
    break
  fi
  sleep 5
done

if [ "$healed" -ne 1 ]; then
  # Dump the last known state to help post-mortem; best-effort only.
  log "final admin info:"
  mc_in minio-1 admin info veza-local 2>&1 | sed 's/^/ /' >&2 || true
  fail "self-heal did not complete within ${HEAL_TIMEOUT_SECONDS}s" 4
fi
|
|
|
|
# -----------------------------------------------------------------------------
# 6. Cleanup.
# -----------------------------------------------------------------------------
log "step 6: cleanup test object"

# Cleanup is best-effort: a leftover file must not turn a passing run into a
# failure, hence the `|| true` guards.
mc_in minio-1 rm "veza-local/${BUCKET}/resilience-test.bin" || true

# The source file was generated on minio-1 ...
incus exec minio-1 -- rm -f /tmp/resilience-test.bin || true
# ... while the read-back copy lives on whichever node served the degraded
# read, which is not minio-1 when minio-1 was one of the killed nodes.
incus exec "${read_via:-minio-1}" -- rm -f /tmp/resilience-readback.bin || true

log "PASS: cluster survived ${#KILL[@]}-node loss + self-healed within budget"
exit 0
|