diff --git a/config/prometheus/alert_rules.yml b/config/prometheus/alert_rules.yml index b62f90140..86516ba77 100644 --- a/config/prometheus/alert_rules.yml +++ b/config/prometheus/alert_rules.yml @@ -79,3 +79,44 @@ groups: systemctl status pgbackrest-drill.timer journalctl -u pgbackrest-drill.service -n 200 runbook_url: "https://veza.fr/runbooks/backup-restore-drill-stale" + + # v1.0.9 W3 Day 12: distributed MinIO health. EC:2 tolerates 2-drive + # loss before data becomes unavailable, so the alert fires the moment + # one drive is offline — gives us margin to react before the second + # failure exhausts redundancy. + - name: veza_minio + rules: + - alert: MinIODriveOffline + # minio_node_drive_online is 0 when MinIO sees a drive as offline. + # The metric is exposed by every node (set MINIO_PROMETHEUS_AUTH_TYPE=public) + # so a single missing scrape doesn't trip the alert. + expr: min(minio_node_drive_online_total) by (server) < min(minio_node_drive_total) by (server) + for: 2m + labels: + severity: warning + page: "false" + annotations: + summary: "MinIO drive offline on {{ $labels.server }}" + description: | + One or more drives report offline on {{ $labels.server }}. EC:2 + still serves reads, but a second drive failure would cause a + data-unavailability event. Investigate within the hour. + ssh {{ $labels.server }} sudo journalctl -u minio -n 200 + runbook_url: "https://veza.fr/runbooks/minio-drive-offline" + + - alert: MinIONodesUnreachable + # > 1 node down on a 4-node EC:2 cluster = redundancy exhausted. + # Pages the on-call. (Threshold below the 2-drive tolerance because + # we want the page BEFORE we run out of room for another failure.) + expr: count(up{job="minio"} == 0) >= 2 + for: 1m + labels: + severity: critical + page: "true" + annotations: + summary: "Two or more MinIO nodes unreachable" + description: | + EC:2 tolerates 2-drive loss. With 1 drive per node, ≥ 2 nodes + unreachable means we are at-or-past the redundancy ceiling. + Any further failure causes data unavailability. Page now. + runbook_url: "https://veza.fr/runbooks/minio-nodes-unreachable" diff --git a/docs/ENV_VARIABLES.md b/docs/ENV_VARIABLES.md index 3d16f491b..d4728827a 100644 --- a/docs/ENV_VARIABLES.md +++ b/docs/ENV_VARIABLES.md @@ -235,12 +235,14 @@ Opt-in. Le path upload principal n'utilise pas encore S3 (FUNCTIONAL_AUDIT §4 i | Variable | Défaut | Lu à | Rôle | | --- | --- | --- | --- | | `AWS_S3_ENABLED` | `false` | `config.go:364` | Master switch. | -| `AWS_S3_BUCKET` | (vide) | `config.go:359` | Nom bucket. | +| `AWS_S3_BUCKET` | (vide) | `config.go:359` | Nom bucket. En prod distribué (v1.0.9 W3 Day 12) : `veza-prod-tracks`. | | `AWS_REGION` | `us-east-1` | `config.go:360` | Région. | -| `AWS_S3_ENDPOINT` | (vide) | `config.go:361` | Endpoint custom (MinIO). | +| `AWS_S3_ENDPOINT` | (vide) | `config.go:361` | Endpoint custom (MinIO). En prod distribué : `http://minio-1.lxd:9000` directement, ou via HAProxy (v1.0.9 W4 day 19). | | `AWS_ACCESS_KEY_ID` | (vide) | `config.go:362` | Optionnel si IAM role EC2. | | `AWS_SECRET_ACCESS_KEY` | (vide) | `config.go:363` | — | +**Migration single-node → distribué (v1.0.9 W3 Day 12)** : `bash scripts/minio-migrate-from-single.sh` mirroir le bucket existant vers le nouveau cluster EC:2 4-nœuds. Voir `infra/ansible/roles/minio_distributed/README.md` pour le déploiement. + ## 13. 
HLS streaming + track storage backend ### HLS diff --git a/infra/ansible/inventory/lab.yml b/infra/ansible/inventory/lab.yml index ef26586fb..4ac357d47 100644 --- a/infra/ansible/inventory/lab.yml +++ b/infra/ansible/inventory/lab.yml @@ -72,6 +72,18 @@ all: # references this group to point each sentinel at it. hosts: redis-1: + # v1.0.9 W3 Day 12: distributed MinIO with EC:2. 4 Incus containers, + # each providing one drive ; single erasure set tolerates 2 simultaneous + # node failures. + minio_nodes: + hosts: + minio-1: + minio-2: + minio-3: + minio-4: + vars: + ansible_connection: community.general.incus + ansible_python_interpreter: /usr/bin/python3 # v1.0.9 Day 9: otel-collector + Tempo for distributed tracing. # Each runs in its own Incus container; the API on the host points # at otel-collector.lxd:4317 via OTEL_EXPORTER_OTLP_ENDPOINT. diff --git a/infra/ansible/playbooks/minio_distributed.yml b/infra/ansible/playbooks/minio_distributed.yml new file mode 100644 index 000000000..fc19e13cc --- /dev/null +++ b/infra/ansible/playbooks/minio_distributed.yml @@ -0,0 +1,60 @@ +# MinIO distributed playbook — provisions 4 Incus containers +# (minio-1 / minio-2 / minio-3 / minio-4) and lays down the +# distributed cluster with EC:2. +# +# v1.0.9 W3 Day 12. +# +# Run with: +# ansible-galaxy collection install community.general +# ansible-playbook -i inventory/lab.yml playbooks/minio_distributed.yml --check +# ansible-playbook -i inventory/lab.yml playbooks/minio_distributed.yml \ +# --extra-vars '{"minio_root_user":"...","minio_root_password":"..."}' +--- +- name: Provision Incus containers for the MinIO formation + hosts: incus_hosts + become: true + gather_facts: true + tasks: + - name: Launch minio-{1..4} + ansible.builtin.shell: + cmd: | + set -e + for ct in minio-1 minio-2 minio-3 minio-4; do + if ! incus info "$ct" >/dev/null 2>&1; then + incus launch images:ubuntu/22.04 "$ct" + for _ in $(seq 1 30); do + if incus exec "$ct" -- cloud-init status 2>/dev/null | grep -q "status: done"; then + break + fi + sleep 1 + done + incus exec "$ct" -- apt-get update + incus exec "$ct" -- apt-get install -y python3 python3-apt + fi + done + args: + executable: /bin/bash + register: provision_result + changed_when: "'incus launch' in provision_result.stdout" + tags: [minio, provision] + + - name: Refresh inventory so the new containers are reachable + ansible.builtin.meta: refresh_inventory + +- name: Apply common baseline to MinIO containers + hosts: minio_nodes + become: true + gather_facts: true + roles: + - common + +# All 4 nodes need MinIO installed before any one of them can finish +# starting (the binary blocks until the cluster forms quorum). Run +# the role across the group in parallel — Ansible default `forks: 5` +# is enough for 4 hosts. +- name: Install + configure MinIO server on every node + hosts: minio_nodes + become: true + gather_facts: true + roles: + - minio_distributed diff --git a/infra/ansible/roles/minio_distributed/README.md b/infra/ansible/roles/minio_distributed/README.md new file mode 100644 index 000000000..67d9fe0de --- /dev/null +++ b/infra/ansible/roles/minio_distributed/README.md @@ -0,0 +1,118 @@ +# `minio_distributed` role — distributed MinIO with EC:2 + +Four Incus containers, each running one MinIO server. Single erasure set of 4 drives = 2 data + 2 parity. The cluster tolerates **2 simultaneous node failures** without data loss; storage efficiency is 50% (1 GB raw → 500 MB usable). 
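A quick post-deploy sanity check of that layout (a sketch — the `veza-distributed` alias is illustrative, and the `mc admin info --json` field names can shift between MinIO releases, so adjust the `jq` paths if they don't match your output):

```bash
# Confirm the formation sees 4 servers, all drives online, and EC:2 parity
# on the STANDARD storage class. The jq paths below are assumptions based on
# recent `mc admin info --json` output shapes — verify against your release.
mc alias set veza-distributed http://minio-1.lxd:9000 "$MINIO_ROOT_USER" "$MINIO_ROOT_PASSWORD"
mc admin info veza-distributed --json | jq '{
  servers:        (.info.servers | length),
  drives_online:  .info.backend.onlineDisks,
  drives_offline: .info.backend.offlineDisks,
  parity:         .info.backend.standardSCParity
}'
```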
+ +## Topology + +``` + S3 API on :9000 + │ + ┌───────────────┼───────────────┐ + │ │ │ + ┌────▼────┐ ┌────▼────┐ ┌────▼────┐ ┌────▼────┐ + │ minio-1 │ │ minio-2 │ │ minio-3 │ │ minio-4 │ + │ /data │ │ /data │ │ /data │ │ /data │ + └─────────┘ └─────────┘ └─────────┘ └─────────┘ + └─── single erasure set, EC:2 ───┘ +``` + +Each node also runs the web console on `:9001`. + +## Why EC:2 (not 4 or larger) + +- **Recoverability ceiling.** EC:N tolerates N drive losses. With 4 drives, EC:4 is a 4-way mirror — 25% efficiency, lose-3 OK but with no functional gain over EC:2 in the failure modes we care about (concurrent node losses). +- **Write amplification.** EC:2 writes each object to 4 nodes (2 data + 2 parity). EC:4 would write to all 4 + a copy = 4-way replication. Doubling the wire cost for marginal durability isn't worth it on a 4-node cluster. +- **Future-proofing.** When we go to 6+ nodes (W3+), the natural upgrade is EC:3 across a 6-drive set, NOT growing EC on the same 4 drives. + +## Defaults + +| variable | default | meaning | +| --------------------------------------- | ---------------------------------- | ---------------------------------------------------- | +| `minio_version` | `RELEASE.2025-09-07T16-13-09Z` | matches docker-compose.yml — keep them locked together | +| `minio_port` | `9000` | S3 API | +| `minio_console_port` | `9001` | web console | +| `minio_data_path` | `/var/lib/minio` | drive root on each node | +| `minio_storage_class_standard` | `EC:2` | parity count for STANDARD storage class | +| `minio_bucket_tracks` | `veza-prod-tracks` | prod bucket created on first apply | +| `minio_noncurrent_version_expiry_days` | `30` | delete old object versions after N days | +| `minio_cold_tier_after_days` | `90` | only effective if `minio_remote_tier_name` is set | +| `minio_remote_tier_name` | `""` (none) | future remote tier (Glacier / B2). v1.1 territory. | +| `minio_root_user` / `minio_root_password`| (vault) | root credentials | + +## Vault setup + +```yaml +# group_vars/minio_ha.vault.yml — encrypt with `ansible-vault encrypt` +minio_root_user: "" +minio_root_password: "" +``` + +The role asserts the placeholder values are gone before applying to anything other than `lab`. + +## Backend integration + +**No code change.** The backend's `internal/services/storage/s3*` already speaks the AWS SDK v2 ; pointing it at the new cluster is a config flip : + +```env +AWS_S3_ENABLED=true +AWS_S3_BUCKET=veza-prod-tracks +AWS_S3_ENDPOINT=http://minio-1.lxd:9000 # or behind HAProxy +AWS_S3_REGION=us-east-1 # MinIO default region +AWS_ACCESS_KEY_ID= +AWS_SECRET_ACCESS_KEY= +``` + +For prod, front the 4 nodes with HAProxy (round-robin, health-checked) so the backend sees a single endpoint and tolerates any 1-node loss without DNS edits. HAProxy config lives in `infra/haproxy/` (W4 day 19 ties this in). + +## Migration from single-node + +```bash +# On the old single-node host (or via mc on a workstation) : +mc alias set veza-current http://veza.fr:19000 +mc alias set veza-distributed http://minio-1.lxd:9000 + +# Mirror : preserves versioning, ACLs, content-types. +mc mirror --preserve veza-current/veza-files veza-distributed/veza-prod-tracks + +# Verify count + bytes match before flipping the AWS_S3_ENDPOINT in +# backend env : +mc ls --recursive veza-current/veza-files | wc -l +mc ls --recursive veza-distributed/veza-prod-tracks | wc -l +``` + +The old bucket can be kept hot for ~ 1 week after the flip in case a rollback is needed, then `mc rm --recursive --force --dangerous` drops it. 
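Counting objects catches gross mismatches but not size drift. `mc diff` is a stronger pre-flip check (a sketch reusing the `veza-current` / `veza-distributed` aliases set in the block above — empty output means the two buckets are content-identical):

```bash
# Lists objects missing on either side or differing in size.
# No output = safe to flip AWS_S3_ENDPOINT to the new cluster.
mc diff veza-current/veza-files veza-distributed/veza-prod-tracks
```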
+ +## Operations + +```bash +# Cluster health (admin info = info about each drive) : +mc admin info veza-distributed + +# Per-node verbose state : +ssh minio-1 sudo journalctl -u minio -n 100 --no-pager + +# Watch heal progress (after a node was offline / drive replaced) : +mc admin heal veza-distributed --recursive + +# Check lifecycle policy : +mc ilm ls veza-distributed/veza-prod-tracks + +# Console UI (per-node — pick any) : +open http://minio-1.lxd:9001 +``` + +## Failover smoke test + +```bash +MINIO_ROOT_USER=... MINIO_ROOT_PASSWORD=... \ + bash infra/ansible/tests/test_minio_resilience.sh +``` + +Sequence : upload 100 MB random file, kill 2 nodes, assert read still works, restart nodes, wait for self-heal, assert all 4 nodes report healthy. + +## What this role does NOT cover + +- **Cross-DC replication.** Single-host (lab) or single-region in v1.0. v1.1+ adds bucket replication to a remote cluster. +- **Site replication / federation.** Multi-tenant federation is out of scope. +- **Cold tier transitions.** `minio_remote_tier_name` is empty by default — no Glacier / B2 / second-cluster behind the lifecycle yet. Wire when needed. +- **mTLS.** `--tls-cert/key` is W4. The Incus bridge is the security boundary today. diff --git a/infra/ansible/roles/minio_distributed/defaults/main.yml b/infra/ansible/roles/minio_distributed/defaults/main.yml new file mode 100644 index 000000000..424c5d54e --- /dev/null +++ b/infra/ansible/roles/minio_distributed/defaults/main.yml @@ -0,0 +1,48 @@ +# minio_distributed defaults — 4-node distributed MinIO with EC:2 +# (single erasure set: 4 drives = 2 data + 2 parity, tolerates 2 +# simultaneous drive/node losses, 50% storage efficiency). +# +# Pinned to the same release as docker-compose so dev / prod parity +# is preserved (a sneaky bucket-policy regression in a newer MinIO +# release would otherwise show up first in prod). +--- +minio_version: "RELEASE.2025-09-07T16-13-09Z" +minio_arch: amd64 + +minio_port: 9000 # S3 API +minio_console_port: 9001 # Web console +minio_data_path: /var/lib/minio +minio_etc: /etc/minio + +# Erasure-coding parity. With 4 drives in the set the only viable +# values are EC:2 (default) and EC:4 (mirror — wastes capacity). +# EC:2 = 4 drives × 0.5 = 2 drives' worth of data, lose-2 OK. +minio_storage_class_standard: "EC:2" + +# Auth — placeholders. Override via Vault before applying to staging +# or prod. The first task asserts these were overridden. +minio_root_user: "CHANGE_ME_VAULT" +minio_root_password: "CHANGE_ME_VAULT_PASSWORD" + +# Bucket layout — `veza-prod-tracks` is the prod bucket holding +# original audio files + HLS segments. Lifecycle policy lives in +# templates/lifecycle.json.j2. +minio_bucket_tracks: "veza-prod-tracks" + +# Versioning retention for noncurrent versions of objects in the +# tracks bucket. After this many days, expired noncurrent versions +# are deleted. Keeps the bucket from growing unbounded under writers +# that occasionally overwrite (album re-releases, re-encoded HLS). +minio_noncurrent_version_expiry_days: 30 + +# Object age threshold for "cold" tier transition. v1.0 has no +# remote tier configured (no Glacier / B2 backing yet) so this +# directive is a placeholder — the lifecycle.json.j2 template emits +# a *delete-marker expiration* rule by default and only emits a +# transition rule if `minio_remote_tier_name` is non-empty. +minio_cold_tier_after_days: 90 +minio_remote_tier_name: "" # e.g. 
"GLACIER" once a remote tier is wired + +# `mc` (MinIO client) version used by the init task to create the +# bucket + apply lifecycle. Pinned to the release matching the server. +minio_mc_version: "RELEASE.2025-09-07T05-25-40Z" diff --git a/infra/ansible/roles/minio_distributed/handlers/main.yml b/infra/ansible/roles/minio_distributed/handlers/main.yml new file mode 100644 index 000000000..a27bfd97e --- /dev/null +++ b/infra/ansible/roles/minio_distributed/handlers/main.yml @@ -0,0 +1,6 @@ +--- +- name: Restart minio + ansible.builtin.systemd: + name: minio + state: restarted + daemon_reload: true diff --git a/infra/ansible/roles/minio_distributed/tasks/main.yml b/infra/ansible/roles/minio_distributed/tasks/main.yml new file mode 100644 index 000000000..981b8737d --- /dev/null +++ b/infra/ansible/roles/minio_distributed/tasks/main.yml @@ -0,0 +1,178 @@ +# minio_distributed role — installs MinIO server (versioned), drops +# the systemd unit pointing at all 4 nodes via MINIO_VOLUMES, starts +# the cluster. Idempotent. +# +# After every node converges, a one-shot init task on the FIRST node +# in `minio_nodes` creates the prod bucket + applies versioning + +# lifecycle. Running it on a single node is sufficient — MinIO +# replicates bucket metadata across the erasure set. +--- +- name: Vault placeholders are overridden in non-lab envs + ansible.builtin.assert: + that: + - minio_root_user != "CHANGE_ME_VAULT" + - minio_root_password != "CHANGE_ME_VAULT_PASSWORD" + fail_msg: | + minio_root_user / minio_root_password still hold placeholder + values. Provide them via group_vars/minio_ha.vault.yml (encrypted) + before applying this role to staging or prod. + when: (deploy_env | default("lab")) != "lab" + tags: [minio, assert] + +- name: Ensure minio user + ansible.builtin.user: + name: minio + system: true + home: "{{ minio_data_path }}" + shell: /usr/sbin/nologin + create_home: true + tags: [minio, install] + +- name: Ensure data + config directories + ansible.builtin.file: + path: "{{ item }}" + state: directory + owner: minio + group: minio + mode: "0750" + loop: + - "{{ minio_data_path }}" + - "{{ minio_etc }}" + tags: [minio, install] + +- name: Check installed MinIO version + ansible.builtin.stat: + path: "/usr/local/bin/minio-{{ minio_version }}" + register: minio_installed + tags: [minio, install] + +- name: Download MinIO server binary (versioned) + ansible.builtin.get_url: + url: "https://dl.min.io/server/minio/release/linux-{{ minio_arch }}/archive/minio.{{ minio_version }}" + dest: "/usr/local/bin/minio-{{ minio_version }}" + mode: "0755" + owner: root + group: root + when: not minio_installed.stat.exists + tags: [minio, install] + +- name: Symlink /usr/local/bin/minio → versioned binary + ansible.builtin.file: + src: "/usr/local/bin/minio-{{ minio_version }}" + dest: /usr/local/bin/minio + state: link + force: true + notify: Restart minio + tags: [minio, install] + +- name: Check installed mc client version + ansible.builtin.stat: + path: "/usr/local/bin/mc-{{ minio_mc_version }}" + register: mc_installed + tags: [minio, install] + +- name: Download mc client (versioned, used by bucket init task) + ansible.builtin.get_url: + url: "https://dl.min.io/client/mc/release/linux-{{ minio_arch }}/archive/mc.{{ minio_mc_version }}" + dest: "/usr/local/bin/mc-{{ minio_mc_version }}" + mode: "0755" + owner: root + group: root + when: not mc_installed.stat.exists + tags: [minio, install] + +- name: Symlink /usr/local/bin/mc → versioned binary + ansible.builtin.file: + src: "/usr/local/bin/mc-{{ 
minio_mc_version }}" + dest: /usr/local/bin/mc + state: link + force: true + tags: [minio, install] + +- name: Render /etc/default/minio + ansible.builtin.template: + src: minio.env.j2 + dest: /etc/default/minio + owner: root + group: minio + mode: "0640" + notify: Restart minio + tags: [minio, config] + +- name: Render systemd unit + ansible.builtin.template: + src: minio.service.j2 + dest: /etc/systemd/system/minio.service + owner: root + group: root + mode: "0644" + notify: Restart minio + tags: [minio, service] + +- name: Enable + start minio + ansible.builtin.systemd: + name: minio + state: started + enabled: true + daemon_reload: true + tags: [minio, service] + +# ----------------------------------------------------------------------- +# Bucket + lifecycle init — runs once, on the first node only. The +# erasure-coded cluster syncs metadata across nodes so we don't need +# to repeat this everywhere. +# ----------------------------------------------------------------------- +- name: Wait for MinIO API to accept connections (every node) + ansible.builtin.wait_for: + host: "{{ ansible_default_ipv4.address | default('127.0.0.1') }}" + port: "{{ minio_port }}" + timeout: 60 + tags: [minio, init] + +- name: Render lifecycle policy + ansible.builtin.template: + src: lifecycle.json.j2 + dest: "{{ minio_etc }}/lifecycle.json" + owner: root + group: minio + mode: "0640" + when: inventory_hostname == groups['minio_nodes'][0] + tags: [minio, init] + +- name: Configure mc alias for the local cluster + ansible.builtin.command: + cmd: >- + /usr/local/bin/mc alias set veza-local + http://localhost:{{ minio_port }} + {{ minio_root_user }} {{ minio_root_password }} + changed_when: false + no_log: true + when: inventory_hostname == groups['minio_nodes'][0] + tags: [minio, init] + +- name: Create the prod bucket if it doesn't exist + ansible.builtin.command: + cmd: /usr/local/bin/mc mb --ignore-existing veza-local/{{ minio_bucket_tracks }} + register: mc_mb + changed_when: "'Bucket created successfully' in mc_mb.stdout" + when: inventory_hostname == groups['minio_nodes'][0] + tags: [minio, init] + +- name: Enable versioning on the prod bucket + ansible.builtin.command: + cmd: /usr/local/bin/mc version enable veza-local/{{ minio_bucket_tracks }} + changed_when: false + when: inventory_hostname == groups['minio_nodes'][0] + tags: [minio, init] + +- name: Apply lifecycle policy + ansible.builtin.command: + cmd: >- + /usr/local/bin/mc ilm import + veza-local/{{ minio_bucket_tracks }} + < {{ minio_etc }}/lifecycle.json + args: + executable: /bin/bash + changed_when: false + when: inventory_hostname == groups['minio_nodes'][0] + tags: [minio, init] diff --git a/infra/ansible/roles/minio_distributed/templates/lifecycle.json.j2 b/infra/ansible/roles/minio_distributed/templates/lifecycle.json.j2 new file mode 100644 index 000000000..d0e64fe18 --- /dev/null +++ b/infra/ansible/roles/minio_distributed/templates/lifecycle.json.j2 @@ -0,0 +1,29 @@ +{ + "Rules": [ + { + "ID": "expire-noncurrent-versions", + "Status": "Enabled", + "Filter": {}, + "NoncurrentVersionExpiration": { + "NoncurrentDays": {{ minio_noncurrent_version_expiry_days }} + } + }, + { + "ID": "abort-multipart-uploads", + "Status": "Enabled", + "Filter": {}, + "AbortIncompleteMultipartUpload": { + "DaysAfterInitiation": 7 + } + }{% if minio_remote_tier_name %}, + { + "ID": "transition-cold-tier", + "Status": "Enabled", + "Filter": {}, + "Transition": { + "Days": {{ minio_cold_tier_after_days }}, + "StorageClass": "{{ minio_remote_tier_name }}" + } + 
}{% endif %} + ] +} diff --git a/infra/ansible/roles/minio_distributed/templates/minio.env.j2 b/infra/ansible/roles/minio_distributed/templates/minio.env.j2 new file mode 100644 index 000000000..4f7bf6cc5 --- /dev/null +++ b/infra/ansible/roles/minio_distributed/templates/minio.env.j2 @@ -0,0 +1,28 @@ +# Managed by Ansible — do not edit by hand. +# Distributed MinIO env file. Same values on every node — MinIO uses +# MINIO_VOLUMES (set on the systemd unit ExecStart line via this env) +# to discover the cluster topology. + +MINIO_ROOT_USER={{ minio_root_user }} +MINIO_ROOT_PASSWORD={{ minio_root_password }} + +# Cluster topology — bracket-expansion form. MinIO expands +# minio-{1...4}.lxd into the 4 hostnames + dials each on port 9000. +# Single drive per node = single erasure set of 4 drives. +MINIO_VOLUMES="http://minio-{1...{{ groups['minio_nodes'] | length }}}.lxd:{{ minio_port }}{{ minio_data_path }}" + +# Force EC:2 on the standard storage class. Without this, MinIO +# auto-picks based on drive count ; pinning makes the policy explicit. +MINIO_STORAGE_CLASS_STANDARD={{ minio_storage_class_standard }} + +# Console UI binds on a separate port so the firewall can isolate it +# from public S3 traffic. Behind a reverse proxy in prod. +MINIO_OPTS="--console-address :{{ minio_console_port }}" + +# Prometheus metrics — enabled with bearer auth disabled for the +# local Incus bridge. mTLS is W4 territory. +MINIO_PROMETHEUS_AUTH_TYPE=public + +# Browser banner — shows in the console so operators know which +# instance they're poking at. +MINIO_BROWSER_REDIRECT_URL=http://{{ ansible_hostname }}.lxd:{{ minio_console_port }} diff --git a/infra/ansible/roles/minio_distributed/templates/minio.service.j2 b/infra/ansible/roles/minio_distributed/templates/minio.service.j2 new file mode 100644 index 000000000..e23698ebe --- /dev/null +++ b/infra/ansible/roles/minio_distributed/templates/minio.service.j2 @@ -0,0 +1,34 @@ +# Managed by Ansible — do not edit by hand. +[Unit] +Description=MinIO Distributed (EC:2 across {{ groups['minio_nodes'] | length }} nodes) +Documentation=https://min.io/docs/minio/linux/index.html +After=network-online.target +Wants=network-online.target +AssertFileIsExecutable=/usr/local/bin/minio + +[Service] +Type=notify +User=minio +Group=minio +EnvironmentFile=/etc/default/minio +# $MINIO_VOLUMES + $MINIO_OPTS are read by the binary itself; the +# `server` subcommand only needs the volumes path argument. +ExecStart=/usr/local/bin/minio server $MINIO_OPTS $MINIO_VOLUMES +Restart=always +RestartSec=5s +LimitNOFILE=1048576 +TimeoutStopSec=infinity +SendSIGKILL=no + +# Hardening — same baseline as the other Ansible-managed daemons. +NoNewPrivileges=true +ProtectSystem=strict +ProtectHome=true +ReadWritePaths={{ minio_data_path }} +PrivateTmp=true +ProtectKernelTunables=true +ProtectKernelModules=true +ProtectControlGroups=true + +[Install] +WantedBy=multi-user.target diff --git a/infra/ansible/tests/test_minio_resilience.sh b/infra/ansible/tests/test_minio_resilience.sh new file mode 100755 index 000000000..969ce7e33 --- /dev/null +++ b/infra/ansible/tests/test_minio_resilience.sh @@ -0,0 +1,159 @@ +#!/usr/bin/env bash +# test_minio_resilience.sh — validate distributed MinIO survives 2 +# simultaneous node losses (EC:2 acceptance criterion). +# +# Sequence : +# 1. upload a 100 MB random file to veza-prod-tracks +# 2. stop 2 of the 4 minio containers +# 3. read the file back through a surviving node — must succeed +# 4. restart the stopped containers +# 5. wait for self-heal +# 6. 
assert all 4 nodes report healthy +# +# v1.0.9 W3 Day 12 — acceptance for ROADMAP_V1.0_LAUNCH.md §Semaine 3 +# day 12: "EC4+2 résiste à 2 nœud kills, dashboard MinIO healthcheck vert". +# +# Usage: +# MINIO_ROOT_USER=... MINIO_ROOT_PASSWORD=... \ +# bash infra/ansible/tests/test_minio_resilience.sh +# +# Exit codes: +# 0 — survived 2-node loss, self-heal completed +# 1 — cluster not healthy at start +# 2 — read failed during 2-node loss (EC:2 didn't deliver) +# 3 — required tool missing +# 4 — self-heal didn't complete within timeout +set -euo pipefail + +CONTAINERS=(minio-1 minio-2 minio-3 minio-4) +KILL=(${KILL_NODES:-minio-2 minio-3}) +BUCKET=${BUCKET:-veza-prod-tracks} +TEST_OBJECT_SIZE_MB=${TEST_OBJECT_SIZE_MB:-100} +HEAL_TIMEOUT_SECONDS=${HEAL_TIMEOUT_SECONDS:-300} +MINIO_ROOT_USER=${MINIO_ROOT_USER:-?} +MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-?} + +log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; } +fail() { log "FAIL: $*"; exit "${2:-2}"; } + +require() { + command -v "$1" >/dev/null 2>&1 || fail "required tool missing on host: $1" 3 +} + +require incus +require date +require dd + +if [ "$MINIO_ROOT_USER" = "?" ] || [ "$MINIO_ROOT_PASSWORD" = "?" ]; then + fail "MINIO_ROOT_USER and MINIO_ROOT_PASSWORD env vars are required" 3 +fi + +# Helper : run mc inside a chosen surviving container so we don't +# need mc on the host. Each container has /usr/local/bin/mc. +mc_in() { + local ct=$1; shift + incus exec "$ct" -- /usr/local/bin/mc "$@" +} + +# Helper : (re-)set the alias on the chosen container. +mc_alias() { + local ct=$1 + mc_in "$ct" alias set veza-local "http://localhost:9000" \ + "$MINIO_ROOT_USER" "$MINIO_ROOT_PASSWORD" >/dev/null +} + +# ----------------------------------------------------------------------------- +# 0. Sanity — cluster healthy at start. +# ----------------------------------------------------------------------------- +log "step 0: pre-flight — cluster health on minio-1" +mc_alias minio-1 +admin_info=$(mc_in minio-1 admin info veza-local 2>&1 || true) +log "admin info:" +echo "$admin_info" | sed 's/^/ /' >&2 + +if echo "$admin_info" | grep -qiE "offline|unreachable"; then + fail "cluster reports offline drives at start — refusing to test from a degraded baseline" 1 +fi + +# ----------------------------------------------------------------------------- +# 1. Upload test object. +# ----------------------------------------------------------------------------- +log "step 1: generating + uploading ${TEST_OBJECT_SIZE_MB}MB test object" +incus exec minio-1 -- bash -c "dd if=/dev/urandom of=/tmp/resilience-test.bin bs=1M count=${TEST_OBJECT_SIZE_MB} status=none" +src_sha=$(incus exec minio-1 -- sha256sum /tmp/resilience-test.bin | awk '{print $1}') +mc_in minio-1 cp /tmp/resilience-test.bin "veza-local/${BUCKET}/resilience-test.bin" +log "source SHA-256: $src_sha" + +# ----------------------------------------------------------------------------- +# 2. Stop 2 nodes — simulate concurrent failures. +# ----------------------------------------------------------------------------- +log "step 2: stopping ${KILL[*]} (concurrent failure simulation)" +for ct in "${KILL[@]}"; do + incus stop --force "$ct" & +done +wait + +# ----------------------------------------------------------------------------- +# 3. Read back through a surviving node — EC:2 must reconstruct. 
+# ----------------------------------------------------------------------------- +survivors=() +for ct in "${CONTAINERS[@]}"; do + for k in "${KILL[@]}"; do [ "$ct" = "$k" ] && continue 2; done + survivors+=("$ct") +done +read_via=${survivors[0]} + +log "step 3: reading back via $read_via (EC:2 should reconstruct)" +mc_alias "$read_via" +sleep 5 # give MinIO a moment to mark the killed nodes offline +if ! mc_in "$read_via" cp "veza-local/${BUCKET}/resilience-test.bin" /tmp/resilience-readback.bin; then + fail "read failed during 2-node loss — EC:2 did not deliver the redundancy promise" 2 +fi +read_sha=$(incus exec "$read_via" -- sha256sum /tmp/resilience-readback.bin | awk '{print $1}') + +if [ "$src_sha" != "$read_sha" ]; then + fail "checksum mismatch: source=$src_sha read=$read_sha — silent corruption during reconstruction" 2 +fi +log "checksum matches under degraded mode" + +# ----------------------------------------------------------------------------- +# 4. Restart the stopped nodes. +# ----------------------------------------------------------------------------- +log "step 4: restarting ${KILL[*]}" +for ct in "${KILL[@]}"; do + incus start "$ct" & +done +wait + +# ----------------------------------------------------------------------------- +# 5. Wait for self-heal. +# ----------------------------------------------------------------------------- +log "step 5: waiting for self-heal (timeout ${HEAL_TIMEOUT_SECONDS}s)" +deadline=$(( $(date +%s) + HEAL_TIMEOUT_SECONDS )) +healed=0 +while [ "$(date +%s)" -lt "$deadline" ]; do + mc_alias minio-1 || { sleep 5; continue; } + info=$(mc_in minio-1 admin info veza-local 2>&1 || true) + if ! echo "$info" | grep -qiE "offline|unreachable" && \ + echo "$info" | grep -qE "[Oo]nline.*(4|four)"; then + healed=1 + break + fi + sleep 5 +done + +if [ "$healed" -ne 1 ]; then + log "final admin info:" + mc_in minio-1 admin info veza-local 2>&1 | sed 's/^/ /' >&2 || true + fail "self-heal did not complete within ${HEAL_TIMEOUT_SECONDS}s" 4 +fi + +# ----------------------------------------------------------------------------- +# 6. Cleanup. +# ----------------------------------------------------------------------------- +log "step 6: cleanup test object" +mc_in minio-1 rm "veza-local/${BUCKET}/resilience-test.bin" || true +incus exec minio-1 -- rm -f /tmp/resilience-test.bin /tmp/resilience-readback.bin || true + +log "PASS: cluster survived ${#KILL[@]}-node loss + self-healed within budget" +exit 0 diff --git a/scripts/minio-migrate-from-single.sh b/scripts/minio-migrate-from-single.sh new file mode 100755 index 000000000..d4570d97b --- /dev/null +++ b/scripts/minio-migrate-from-single.sh @@ -0,0 +1,106 @@ +#!/usr/bin/env bash +# minio-migrate-from-single.sh — copy objects from the single-node +# MinIO bucket (used through v1.0.8) into the new distributed cluster. +# +# v1.0.9 W3 Day 12. The migration is one-way and idempotent — +# `mc mirror --preserve` skips objects that already exist on the +# destination with the same etag, so re-runs are safe + cheap. +# +# Required env : +# SOURCE_ENDPOINT e.g. http://veza.fr:19000 +# SOURCE_ACCESS_KEY minio root user on the old cluster +# SOURCE_SECRET_KEY minio root password on the old cluster +# SOURCE_BUCKET e.g. veza-files (the dev bucket name) +# DEST_ENDPOINT e.g. http://minio-1.lxd:9000 +# DEST_ACCESS_KEY root user on the new distributed cluster +# DEST_SECRET_KEY root password on the new distributed cluster +# DEST_BUCKET e.g. 
veza-prod-tracks +# +# Optional : +# DRY_RUN=1 print what would be copied, don't actually copy +# +# Exit codes : +# 0 — mirror complete + counts match +# 1 — required env missing +# 2 — source or dest cluster unreachable +# 3 — count mismatch after mirror (something silently dropped) +set -euo pipefail + +require_env() { + local v=$1 + if [ -z "${!v:-}" ]; then + echo "FAIL: required env var $v is not set" >&2 + exit 1 + fi +} + +require_env SOURCE_ENDPOINT +require_env SOURCE_ACCESS_KEY +require_env SOURCE_SECRET_KEY +require_env SOURCE_BUCKET +require_env DEST_ENDPOINT +require_env DEST_ACCESS_KEY +require_env DEST_SECRET_KEY +require_env DEST_BUCKET + +if ! command -v mc >/dev/null 2>&1; then + echo "FAIL: mc (MinIO client) not in PATH. Install with:" >&2 + echo " curl -fsSL https://dl.min.io/client/mc/release/linux-amd64/mc -o /usr/local/bin/mc && chmod +x /usr/local/bin/mc" >&2 + exit 1 +fi + +DRY_RUN=${DRY_RUN:-0} + +log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; } + +log "Setting mc aliases" +mc alias set veza-source "$SOURCE_ENDPOINT" "$SOURCE_ACCESS_KEY" "$SOURCE_SECRET_KEY" >/dev/null +mc alias set veza-dest "$DEST_ENDPOINT" "$DEST_ACCESS_KEY" "$DEST_SECRET_KEY" >/dev/null + +# Sanity — both endpoints reachable. +if ! mc admin info veza-source >/dev/null 2>&1; then + echo "FAIL: source cluster $SOURCE_ENDPOINT not reachable" >&2 + exit 2 +fi +if ! mc admin info veza-dest >/dev/null 2>&1; then + echo "FAIL: destination cluster $DEST_ENDPOINT not reachable" >&2 + exit 2 +fi + +log "Source bucket count :" +src_count=$(mc ls --recursive "veza-source/$SOURCE_BUCKET" 2>/dev/null | wc -l) +log " $src_count objects in $SOURCE_BUCKET" + +log "Destination bucket count (before) :" +dest_count_before=$(mc ls --recursive "veza-dest/$DEST_BUCKET" 2>/dev/null | wc -l || echo 0) +log " $dest_count_before objects in $DEST_BUCKET" + +if [ "$DRY_RUN" = "1" ]; then + log "DRY_RUN=1 — running mirror with --dry-run flag" + mc mirror --preserve --dry-run "veza-source/$SOURCE_BUCKET" "veza-dest/$DEST_BUCKET" + exit 0 +fi + +log "Mirroring (this will take time proportional to bucket size)" +mc mirror --preserve "veza-source/$SOURCE_BUCKET" "veza-dest/$DEST_BUCKET" + +log "Verifying object count after mirror" +dest_count_after=$(mc ls --recursive "veza-dest/$DEST_BUCKET" 2>/dev/null | wc -l) +log " $dest_count_after objects in $DEST_BUCKET (was $dest_count_before before)" + +if [ "$dest_count_after" -lt "$src_count" ]; then + echo "FAIL: destination has fewer objects than source ($dest_count_after < $src_count). Mirror is incomplete." >&2 + exit 3 +fi + +log "PASS: mirror complete. Object counts match (src=$src_count dest=$dest_count_after)." +log "" +log "Next steps :" +log " 1. Update backend .env on every API host :" +log " AWS_S3_ENDPOINT=$DEST_ENDPOINT" +log " AWS_S3_BUCKET=$DEST_BUCKET" +log " AWS_ACCESS_KEY_ID=$DEST_ACCESS_KEY" +log " AWS_SECRET_ACCESS_KEY=" +log " 2. Rolling restart of the API tier." +log " 3. Smoke-test : POST /api/v1/tracks (chunked upload), GET /tracks/:id/stream." +log " 4. Keep the old cluster hot for ~ 1 week before decommissioning."
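# ----------------------------------------------------------------------------
# Optional S3-level smoke test — hedged sketch, not part of the acceptance
# criteria. Writes, reads back and deletes a tiny object through the new
# cluster so the endpoint flip isn't the first write it sees. Opt-in via
# SMOKE_TEST=1 ; reuses the `veza-dest` alias set earlier ; the object name
# below is illustrative.
# ----------------------------------------------------------------------------
if [ "${SMOKE_TEST:-0}" = "1" ]; then
  log "Running S3-level smoke test against veza-dest/$DEST_BUCKET"
  echo "smoke $(date -u +%FT%TZ)" | mc pipe "veza-dest/$DEST_BUCKET/smoke-test.txt"
  mc cat "veza-dest/$DEST_BUCKET/smoke-test.txt" >/dev/null
  mc rm "veza-dest/$DEST_BUCKET/smoke-test.txt"
  log "Smoke test OK"
fi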