diff --git a/infra/ansible/ansible.cfg b/infra/ansible/ansible.cfg index b5e626651..b11cde973 100644 --- a/infra/ansible/ansible.cfg +++ b/infra/ansible/ansible.cfg @@ -21,4 +21,10 @@ nocows = 1 # host per playbook run. Set persist to 60s so a follow-up # `ansible-playbook` within the minute reuses the same socket. ssh_args = -o ControlMaster=auto -o ControlPersist=60s -o ServerAliveInterval=15 -pipelining = True +# pipelining=True breaks --ask-become-pass when the remote sudo expects +# a TTY-driven prompt — ansible can't deliver the password through a +# pipe in that mode. Setting it to False is ~5% slower per task but +# makes interactive sudo (no NOPASSWD) work reliably. We DO NOT want +# NOPASSWD sudo on the R720 ; it expands the blast radius of any +# compromise of the operator's account. +pipelining = False diff --git a/infra/ansible/inventory/local.yml b/infra/ansible/inventory/local.yml new file mode 100644 index 000000000..17b920352 --- /dev/null +++ b/infra/ansible/inventory/local.yml @@ -0,0 +1,129 @@ +# Local inventory — run ansible-playbook directly on the R720 (no SSH, +# no --ask-become-pass needed because the operator is already root via +# `sudo` when invoking the script). Useful when : +# * The operator's laptop can't reach the R720 (no WireGuard yet) +# * Disaster recovery — work directly on the host +# * Faster iteration during initial bootstrap +# +# Same shape as staging.yml but : +# * incus_hosts / staging hosts → localhost (connection: local) +# * Container groups (haproxy, veza_app_*, veza_data, forgejo_runner) +# keep using community.general.incus — the connection just goes +# through the LOCAL incus binary, not over SSH. 
+# +# Usage : +# sudo ansible-playbook -i inventory/local.yml playbooks/bootstrap_runner.yml \ +# --vault-password-file /path/to/.vault-pass \ +# -e forgejo_registration_token=$TOKEN \ +# -e forgejo_api_url=https://10.0.20.105:3000 +all: + hosts: + localhost: + ansible_connection: local + ansible_python_interpreter: /usr/bin/python3 + vars: + # Default to the env=staging shape ; override with -e veza_env=prod + # if running prod-specific tasks locally. + veza_env: staging + veza_container_prefix: "veza-staging-" + veza_incus_dns_suffix: lxd + haproxy_topology: blue-green + veza_public_host: staging.veza.fr + veza_public_url: "https://staging.veza.fr" + veza_cors_allowed_origins: + - "https://staging.veza.fr" + veza_log_level: DEBUG + veza_otel_sample_rate: "1.0" + veza_feature_flags: + HYPERSWITCH_ENABLED: "false" + STRIPE_CONNECT_ENABLED: "false" + WEBAUTHN_ENABLED: "true" + veza_release_retention: 30 + postgres_password: "{{ vault_postgres_password }}" + redis_password: "{{ vault_redis_password }}" + rabbitmq_password: "{{ vault_rabbitmq_password }}" + veza_incus_network: net-veza + veza_incus_subnet: 10.0.20.0/24 + children: + incus_hosts: + hosts: + localhost: + staging: + hosts: + localhost: + forgejo_runner: + hosts: + forgejo-runner: + vars: + ansible_connection: community.general.incus + ansible_python_interpreter: /usr/bin/python3 + haproxy: + hosts: + veza-haproxy: + vars: + ansible_connection: community.general.incus + ansible_python_interpreter: /usr/bin/python3 + veza_app_backend: + children: + veza_app_backend_blue: + veza_app_backend_green: + veza_app_backend_tools: + vars: + ansible_connection: community.general.incus + ansible_python_interpreter: /usr/bin/python3 + veza_app_backend_blue: + hosts: + veza-staging-backend-blue: + veza_app_backend_green: + hosts: + veza-staging-backend-green: + veza_app_backend_tools: + hosts: + veza-staging-backend-tools: + veza_app_stream: + children: + veza_app_stream_blue: + veza_app_stream_green: + vars: + 
ansible_connection: community.general.incus + ansible_python_interpreter: /usr/bin/python3 + veza_app_stream_blue: + hosts: + veza-staging-stream-blue: + veza_app_stream_green: + hosts: + veza-staging-stream-green: + veza_app_web: + children: + veza_app_web_blue: + veza_app_web_green: + vars: + ansible_connection: community.general.incus + ansible_python_interpreter: /usr/bin/python3 + veza_app_web_blue: + hosts: + veza-staging-web-blue: + veza_app_web_green: + hosts: + veza-staging-web-green: + veza_data: + children: + veza_data_postgres: + veza_data_redis: + veza_data_rabbitmq: + veza_data_minio: + vars: + ansible_connection: community.general.incus + ansible_python_interpreter: /usr/bin/python3 + veza_data_postgres: + hosts: + veza-staging-postgres: + veza_data_redis: + hosts: + veza-staging-redis: + veza_data_rabbitmq: + hosts: + veza-staging-rabbitmq: + veza_data_minio: + hosts: + veza-staging-minio: diff --git a/infra/ansible/inventory/prod.yml b/infra/ansible/inventory/prod.yml index b79693373..ca7651ee5 100644 --- a/infra/ansible/inventory/prod.yml +++ b/infra/ansible/inventory/prod.yml @@ -23,6 +23,13 @@ all: incus_hosts: hosts: veza-prod: + # forgejo-runner container (target of bootstrap_runner.yml phase 3). + forgejo_runner: + hosts: + forgejo-runner: + vars: + ansible_connection: community.general.incus + ansible_python_interpreter: /usr/bin/python3 # SHARED edge — one HAProxy on the R720 public 443. Serves # staging + prod + forgejo.talas.group simultaneously. Same # container in both staging.yml and prod.yml inventories. diff --git a/infra/ansible/inventory/staging.yml b/infra/ansible/inventory/staging.yml index 2a034ccfa..e5bc94996 100644 --- a/infra/ansible/inventory/staging.yml +++ b/infra/ansible/inventory/staging.yml @@ -39,6 +39,15 @@ all: incus_hosts: hosts: veza-staging: + # forgejo-runner container (target of bootstrap_runner.yml phase 3, + # reached via the community.general.incus connection plugin from + # the Incus host). 
+ forgejo_runner: + hosts: + forgejo-runner: + vars: + ansible_connection: community.general.incus + ansible_python_interpreter: /usr/bin/python3 # SHARED edge — one HAProxy on the R720 public 443. Serves # staging + prod + forgejo.talas.group simultaneously, Host-based # routing per env. NAME deliberately env-agnostic (no veza-staging- diff --git a/infra/ansible/playbooks/bootstrap_runner.yml b/infra/ansible/playbooks/bootstrap_runner.yml new file mode 100644 index 000000000..e93da7155 --- /dev/null +++ b/infra/ansible/playbooks/bootstrap_runner.yml @@ -0,0 +1,232 @@ +# bootstrap_runner.yml — one-time setup of the deploy pipeline's +# infrastructure on the Incus host. Runs BEFORE haproxy.yml / +# deploy_data.yml / deploy_app.yml the first time, then idempotent +# on every subsequent run. +# +# Phases (each idempotent, each guarded by a state check) : +# 1. Incus profiles (veza-app, veza-data ; drop legacy veza-net) +# 2. forgejo-runner has Incus socket + nesting + incus binary +# 3. forgejo-runner registered with the `incus` label +# +# Required extra-vars (passed by the wrapping bootstrap script) : +# forgejo_registration_token short-lived runner registration token +# (fetched from Forgejo API by the wrapper) +# forgejo_api_url full URL to the Forgejo instance +# (used inside the runner container) +# +# Usage from the operator's laptop : +# ansible-playbook -i inventory/staging.yml playbooks/bootstrap_runner.yml \ +# --ask-become-pass \ +# --vault-password-file .vault-pass \ +# -e forgejo_registration_token=$TOKEN \ +# -e forgejo_api_url=https://10.0.20.105:3000 +# +# Usage directly on the R720 : +# ansible-playbook -i inventory/local.yml playbooks/bootstrap_runner.yml \ +# --vault-password-file /etc/talas/vault-pass \ +# -e forgejo_registration_token=$TOKEN \ +# -e forgejo_api_url=https://10.0.20.105:3000 +--- +- name: Validate inputs + hosts: incus_hosts + become: true + gather_facts: false + tasks: + - name: Assert required extra-vars + ansible.builtin.assert: 
+ that: + - forgejo_registration_token is defined + - forgejo_registration_token | length > 10 + - forgejo_api_url is defined + - forgejo_api_url | length > 0 + fail_msg: >- + bootstrap_runner.yml requires forgejo_registration_token + (fetched from $FORGEJO_API/api/v1/repos/$OWNER/$REPO/actions/runners/registration-token) + and forgejo_api_url (e.g. https://10.0.20.105:3000) ; + pass them via -e on the command line. + quiet: true + +# ===================================================================== +# Phase 1 — Incus profiles +# ===================================================================== +- name: Phase 1 — Incus profiles + hosts: incus_hosts + become: true + gather_facts: true + tasks: + - name: Ensure veza-{app,data} profiles exist (empty by default) + ansible.builtin.command: incus profile create {{ item }} + register: profile_create + failed_when: profile_create.rc != 0 and 'already exists' not in profile_create.stderr + changed_when: profile_create.rc == 0 + loop: + - veza-app + - veza-data + + - name: Detect legacy empty veza-net profile + ansible.builtin.command: incus profile show veza-net + register: vnet_show + failed_when: false + changed_when: false + + - name: Drop legacy veza-net profile if it exists and has no devices + ansible.builtin.command: incus profile delete veza-net + when: + - vnet_show.rc == 0 + - "'devices: {}' in vnet_show.stdout" + changed_when: true + +# ===================================================================== +# Phase 2 — forgejo-runner gets Incus socket + nesting + binary +# ===================================================================== +- name: Phase 2 — forgejo-runner Incus access + hosts: incus_hosts + become: true + gather_facts: false + tasks: + - name: Verify forgejo-runner container exists + ansible.builtin.command: incus info forgejo-runner + register: runner_info + failed_when: runner_info.rc != 0 + changed_when: false + + - name: Check if incus-socket device is already attached + 
ansible.builtin.shell: | + incus config device show forgejo-runner | grep -q '^incus-socket:' + register: socket_attached + failed_when: false + changed_when: false + + - name: Attach /var/lib/incus/unix.socket as a disk device + ansible.builtin.command: >- + incus config device add forgejo-runner incus-socket disk + source=/var/lib/incus/unix.socket + path=/var/lib/incus/unix.socket + when: socket_attached.rc != 0 + register: device_attached + + - name: Read current security.nesting setting + ansible.builtin.command: incus config get forgejo-runner security.nesting + register: nesting_val + changed_when: false + + - name: Enable security.nesting=true + ansible.builtin.command: incus config set forgejo-runner security.nesting=true + when: nesting_val.stdout | trim != "true" + register: nesting_set + + - name: Restart forgejo-runner if device or nesting changed + ansible.builtin.command: incus restart forgejo-runner + when: + - device_attached.changed | default(false) or nesting_set.changed | default(false) + + - name: Wait for forgejo-runner to be reachable after restart + ansible.builtin.command: incus exec forgejo-runner -- /bin/true + register: runner_ready + until: runner_ready.rc == 0 + retries: 30 + delay: 1 + changed_when: false + + - name: Check whether incus binary is already in the runner + ansible.builtin.command: incus exec forgejo-runner -- test -x /usr/local/bin/incus + register: binary_present + failed_when: false + changed_when: false + + - name: Push host's /usr/bin/incus into runner:/usr/local/bin/incus + ansible.builtin.command: >- + incus file push /usr/bin/incus + forgejo-runner/usr/local/bin/incus + --mode 0755 + when: binary_present.rc != 0 + + - name: Smoke-test runner can reach Incus socket + ansible.builtin.command: incus exec forgejo-runner -- /usr/local/bin/incus list + register: smoketest + failed_when: false + changed_when: false + + - name: Warn if smoke-test failed (non-fatal — depends on runner user perms) + ansible.builtin.debug: + 
msg: >- + forgejo-runner cannot list Incus from its default user (rc={{ smoketest.rc }}). + This is OK if the systemd unit runs as root inside the container ; + if not, the runner user needs gid alignment with the host's incus-admin group. + when: smoketest.rc != 0 + +# ===================================================================== +# Phase 3 — forgejo-runner registered with `incus` label +# ===================================================================== +- name: Phase 3 — forgejo-runner labels (run inside the runner container) + hosts: forgejo_runner + become: true + gather_facts: false + vars: + ansible_connection: community.general.incus + ansible_python_interpreter: /usr/bin/python3 + tasks: + - name: Locate the runner config file + ansible.builtin.shell: | + for f in /etc/forgejo-runner/.runner /var/lib/forgejo-runner/.runner /opt/forgejo-runner/.runner; do + [ -f "$f" ] && echo "$f" && exit 0 + done + exit 1 + register: runner_cfg_path + failed_when: false + changed_when: false + + - name: Read existing labels (if any) + ansible.builtin.shell: | + jq -r '.labels[]?' 
"{{ runner_cfg_path.stdout }}" 2>/dev/null \ + || grep -oE '"labels":\[[^]]+\]' "{{ runner_cfg_path.stdout }}" 2>/dev/null \ + || echo "" + register: existing_labels + when: runner_cfg_path.rc == 0 + changed_when: false + + - name: Skip if 'incus' label is already present + ansible.builtin.meta: end_play + when: + - runner_cfg_path.rc == 0 + - existing_labels.stdout is defined + - "'incus' in existing_labels.stdout" + + - name: Detect runner binary (forgejo-runner or act_runner) + ansible.builtin.shell: | + for b in forgejo-runner act_runner; do + command -v "$b" >/dev/null 2>&1 && echo "$b" && exit 0 + done + exit 1 + register: runner_bin + changed_when: false + failed_when: runner_bin.rc != 0 + + - name: Stop the runner systemd unit + ansible.builtin.systemd: + name: "{{ runner_bin.stdout }}.service" + state: stopped + register: stop_unit + failed_when: false + + - name: Remove old .runner config to force re-registration + ansible.builtin.file: + path: "{{ runner_cfg_path.stdout }}" + state: absent + when: runner_cfg_path.rc == 0 + + - name: Re-register runner with --labels incus,self-hosted + ansible.builtin.command: >- + {{ runner_bin.stdout }} register + --no-interactive + --instance {{ forgejo_api_url }} + --token {{ forgejo_registration_token }} + --name r720-incus + --labels incus,self-hosted + no_log: true # token is sensitive + + - name: Start the runner systemd unit + ansible.builtin.systemd: + name: "{{ runner_bin.stdout }}.service" + state: started + enabled: true diff --git a/scripts/bootstrap/README.md b/scripts/bootstrap/README.md index 2d8ed359f..f0c30c429 100644 --- a/scripts/bootstrap/README.md +++ b/scripts/bootstrap/README.md @@ -1,125 +1,150 @@ -# `scripts/bootstrap/` +# `scripts/bootstrap/` — bootstrap the Veza deploy pipeline -Two-host bootstrap of the Veza deploy pipeline. Each script is -idempotent, resumable, and read-only by default unless explicitly -asked to mutate. +Two parallel scripts (one per host) + four helpers + one shared lib. 
+Each script is **idempotent**, **resumable**, **read-only by default** +unless explicitly asked to mutate. **No NOPASSWD sudo required**. + +The heavy lifting (Incus profiles, forgejo-runner config, HAProxy +edge, Let's Encrypt) is done by **Ansible playbooks**, not bash. +The shell scripts are thin orchestrators that handle the +chicken-and-egg part : create the Vault that Ansible needs, set +the Forgejo CI secrets, then call `ansible-playbook`. ## Files | File | Where it runs | What it does | |---|---|---| -| `lib.sh` | sourced by all | logging, error trap, idempotent state file, Forgejo API helpers (honours `FORGEJO_INSECURE=1`) | -| `bootstrap-local.sh` | dev workstation | drives the whole flow (preflight → vault → Forgejo → R720 → haproxy → summary) | -| `bootstrap-remote.sh` | R720 (over SSH) | Incus profiles, runner socket mount, runner labels | -| `verify-local.sh` | dev workstation | read-only checks of local state | -| `verify-remote.sh` | R720 | read-only checks of R720 state (run via `verify-remote-ssh.sh`) | -| `verify-remote-ssh.sh` | dev workstation | scp+ssh wrapper that runs `verify-remote.sh` on R720 | -| `enable-auto-deploy.sh` | dev workstation | restores `.forgejo/workflows/` if disabled, uncomments push: trigger | -| `reset-vault.sh` | dev workstation | recovery from a vault password mismatch (destructive — re-prompts) | +| `lib.sh` | sourced by all | logging, error trap, idempotent state file, Forgejo API helpers | +| `bootstrap-local.sh` | operator's laptop | drives Ansible over SSH ; --ask-become-pass on the R720 | +| `bootstrap-r720.sh` | R720 directly (sudo) | drives Ansible locally (connection: local) ; no SSH, no sudo prompts | +| `verify-local.sh` | laptop | read-only checks of local + remote state | +| `verify-r720.sh` | R720 (sudo) | read-only checks of R720 state | +| `enable-auto-deploy.sh` | laptop | restores `.forgejo/workflows/`, uncomments push: trigger | +| `reset-vault.sh` | laptop | recovery from vault password mismatch 
(destructive) | | `.env.example` | template | copy to `.env`, fill in, gitignored | -## State file +## Two scripts, one Ansible -Each host keeps a per-host state file with `phase=DONE timestamp` -lines so a re-run is a no-op for completed phases : +Both `bootstrap-local.sh` and `bootstrap-r720.sh` end up running +the **same two playbooks** : + + * `playbooks/bootstrap_runner.yml` — Incus profiles + forgejo-runner + Incus access + runner registration with `incus` label + * `playbooks/haproxy.yml` — edge HAProxy container + dehydrated + Let's Encrypt issuance for veza.fr / staging.veza.fr / talas.fr / + forgejo.talas.group + +The difference is the **inventory** : + + * laptop → `inventory/staging.yml` (SSH to R720, --ask-become-pass) + * R720 → `inventory/local.yml` (connection: local, already root) + +Pick whichever is convenient. The state files are independent (laptop +keeps state under `.git/talas-bootstrap/`, R720 under `/var/lib/talas/`), +so running both at different times doesn't double-do anything. + +## State files ``` -local : /.git/talas-bootstrap/local.state -R720 : /var/lib/talas/bootstrap.state +laptop : /.git/talas-bootstrap/local.state +R720 : /var/lib/talas/r720-bootstrap.state ``` +`phase=DONE timestamp` per completed phase. Re-run skips DONE phases. To force a phase re-run, delete its line : + ```bash sed -i '/^vault=/d' .git/talas-bootstrap/local.state ``` -## Inter-script communication - -`bootstrap-local.sh` invokes `bootstrap-remote.sh` over SSH by -concatenating `lib.sh` + `bootstrap-remote.sh` and piping into -`sudo -E bash -s` on the R720. The remote script : - -* writes `/var/log/talas-bootstrap.log` on R720 (persistent) -* emits `>>>PHASE::<<<` markers on stdout -* the local script `tee`s those to stderr so the operator sees - remote progress in the same terminal as the local logs - -Resumability : the state file means a SSH disconnect or partial -failure leaves the work it managed to complete marked DONE. 
Re-run -`bootstrap-local.sh` and it picks up where it stopped. - -## Quickstart +## Quickstart — from the laptop ```bash cd /home/senke/git/talas/veza/scripts/bootstrap cp .env.example .env -$EDITOR .env # fill in FORGEJO_ADMIN_TOKEN at minimum +vim .env # at minimum : FORGEJO_ADMIN_TOKEN chmod +x *.sh -# Set up everything +# Set up everything end-to-end : ./bootstrap-local.sh -# Or skip phases you've already done +# Or skip phases you've already done : PHASE=4 ./bootstrap-local.sh -# Verify any time +# Verify any time (read-only) : ./verify-local.sh -ssh ansible@10.0.20.150 'sudo bash' < verify-remote.sh ``` +## Quickstart — directly on the R720 + +```bash +ssh srv-102v +cd /path/to/veza/scripts/bootstrap +cp .env.example .env +vim .env # FORGEJO_ADMIN_TOKEN at minimum +sudo ./bootstrap-r720.sh + +# Verify : +sudo ./verify-r720.sh +``` + +## Sudo on the R720 — the design choice + +The bash scripts **do not require NOPASSWD sudo** on the R720. Two +reasons : + +1. **Trust boundary** — NOPASSWD turns any compromise of the operator's + account into root on the host. Keeping the password requirement + means an attacker also needs to phish/keylog the sudo password. +2. **Ansible's `--ask-become-pass`** is fine for interactive runs. + The operator types the password ONCE per `bootstrap-local.sh` + invocation ; ansible holds it in memory and reuses for every + `become: true` task. No file written, no env var leaked. + +`pipelining = False` in `ansible.cfg` is what makes interactive +`--ask-become-pass` reliable (the previous `True` setting raced sudo's +TTY-driven prompt). + ## What each phase needs | Phase | Needs | |---|---| -| 1. preflight | git, ansible, dig, ssh, jq locally ; SSH to R720 ; DNS resolved (warning only if missing) | -| 2. vault | nothing ; will prompt for vault password and edit `vault.yml` from template | -| 3. forgejo | `FORGEJO_ADMIN_TOKEN` env var or in .env | -| 4. 
r720 | `FORGEJO_ADMIN_TOKEN` (used to fetch runner registration token) ; SSH to R720 with sudo | -| 5. haproxy | DNS public domains resolved + port 80 reachable from Internet ; ansible decryptable vault | -| 6. summary | nothing | +| 1. preflight | git, ansible, dig, ssh, jq locally ; SSH to R720 (laptop) ; DNS resolved (warning if missing) | +| 2. vault | nothing ; auto-generates JWT + 11 random passwords, prompts for vault password | +| 3. forgejo | `FORGEJO_ADMIN_TOKEN` (.env or env) — scopes : write:repository, read:repository | +| 4. ansible | sudo password on R720 (interactive ; not stored) | +| 5. summary | nothing | ## Troubleshooting -- **Phase 1 SSH fails** — verify `R720_HOST` + `R720_USER` in `.env`. - If you use an SSH config alias (e.g. `Host srv-102v` in - `~/.ssh/config`), set `R720_HOST=srv-102v` and either set - `R720_USER=` (empty, alias's User= wins) or match the alias's user. - Test manually : `ssh ${R720_USER}@${R720_HOST} /bin/true`. -- **Phase 2 `cannot decrypt vault.yml`** — the password in - `.vault-pass` doesn't match what was used to encrypt `vault.yml`. - - If you remember the original password, edit `.vault-pass` - (`echo "" > infra/ansible/.vault-pass ; chmod 0400 …`). - - Otherwise : `./reset-vault.sh` — destructive, re-prompts for - everything. -- **Phase 3 `Forgejo API unreachable`** — Forgejo on - `https://10.0.20.105:3000` serves a self-signed cert. Set - `FORGEJO_INSECURE=1` in `.env`. Once the edge HAProxy is up + LE has - issued `forgejo.talas.group`, switch to that URL and clear - `FORGEJO_INSECURE`. -- **Phase 3 `repo not found`** — set `FORGEJO_OWNER` to the actual - org/user owning the repo. Confirm with `git remote -v` (the path - segment after `host:port/`). -- **Phase 4 SSH timeout / sudo prompt** — passwordless sudo needed - for the SSH user. 
Add to `/etc/sudoers.d/talas-bootstrap` : - ``` - senke ALL=(ALL) NOPASSWD: /usr/bin/bash - ``` - Or run the remote half manually : - ``` - scp scripts/bootstrap/{lib.sh,bootstrap-remote.sh} srv-102v:/tmp/ - ssh srv-102v 'sudo FORGEJO_REGISTRATION_TOKEN= bash /tmp/bootstrap-remote.sh' - ``` -- **Phase 5 dehydrated fails** — port 80 must be reachable from - Internet for HTTP-01 (not blocked by ISP, NAT-forwarded). Test - from outside : `curl http://veza.fr/.well-known/acme-challenge/test` - should hit HAProxy's `letsencrypt_backend` (will 404, which is - fine ; what matters is reaching the R720). -- **`.forgejo/workflows/` is missing, only `workflows.disabled/` present** — - expected when the auto-trigger has been gated by renaming the dir. - `enable-auto-deploy.sh` restores it. +* **Phase 1 SSH fails** — verify `R720_HOST` + `R720_USER` in `.env`. + If using an SSH config alias, `R720_HOST=` and leave + `R720_USER=` empty. +* **Phase 2 cannot decrypt** — `./reset-vault.sh` (destructive, + re-prompts for everything). +* **Phase 3 Forgejo unreachable** — set `FORGEJO_INSECURE=1` for + self-signed cert on `https://10.0.20.105:3000`. Update to + `https://forgejo.talas.group` once edge HAProxy + LE is up. +* **Phase 3 token lacks scope** — token needs at minimum + `write:repository`. `write:admin` lets the script auto-create + the registry token ; without it, you'll be prompted to paste + one you create manually. +* **Phase 4 `Timeout waiting for privilege escalation prompt`** — + set `pipelining = False` in `infra/ansible/ansible.cfg`. The + current default is `False` ; revert if it's been changed. +* **Phase 4 dehydrated fails** — port 80 must be reachable from + Internet (HTTP-01 challenge). Test from an external host : + `curl http://veza.fr/`. If it doesn't reach the R720, configure + port forwarding 80 + 443 on your home router / ISP box. +* **Phase 4 Incus network not found** — group_vars defaults to + `net-veza`. 
The script auto-detects from forgejo's network on the + R720 ; if your bridge has a different name, set + `veza_incus_network` in `group_vars/staging.yml` (or + `inventory/local.yml` for the R720 case). ## After bootstrap -- Trigger 1st deploy manually via Forgejo UI : Actions → Veza deploy → Run workflow. -- Once green, run `./enable-auto-deploy.sh` to re-enable push-trigger. -- `verify-local.sh` + `verify-remote.sh` are safe to run any time. +* Trigger 1st deploy manually : Forgejo Actions UI → Veza deploy → Run workflow. +* Once green, run `./enable-auto-deploy.sh` to restore the + `push:main + tag:v*` triggers. +* `verify-{local,r720}.sh` are safe to run any time. diff --git a/scripts/bootstrap/bootstrap-local.sh b/scripts/bootstrap/bootstrap-local.sh index 701aa739c..54bbad5e4 100755 --- a/scripts/bootstrap/bootstrap-local.sh +++ b/scripts/bootstrap/bootstrap-local.sh @@ -1,137 +1,103 @@ #!/usr/bin/env bash -# bootstrap-local.sh — drive bootstrap from the operator's workstation. +# bootstrap-local.sh — run on the operator's laptop. Drives the +# bootstrap end-to-end via Ansible (no NOPASSWD sudo, no manual +# SSH-script-streaming). # -# Phases (each idempotent ; skipped if state file marks DONE) : -# 1. preflight — required tools, SSH to R720, DNS resolution -# 2. vault — render + encrypt group_vars/all/vault.yml, -# write .vault-pass -# 3. forgejo — set repo Secrets / Variables via Forgejo API -# 4. r720 — invoke bootstrap-remote.sh over SSH -# 5. haproxy — ansible-playbook playbooks/haproxy.yml, -# verify Let's Encrypt certs land -# 6. summary — final readiness report +# Phases (each idempotent ; resumable via PHASE=N) : +# 1. preflight — required local tools, SSH to R720, DNS +# 2. vault — render + encrypt vault.yml, write .vault-pass +# 3. forgejo — set repo Secrets / Variables via Forgejo API +# 4. 
ansible-bootstrap — single ansible-playbook run that does : +# * Incus profiles on R720 +# * forgejo-runner Incus socket + nesting + binary +# * forgejo-runner registered with `incus` label +# * HAProxy edge container + Let's Encrypt certs +# 5. summary # -# Resumable : -# PHASE=4 ./bootstrap-local.sh # restart at phase 4 -# -# Inputs (env vars ; can be set in your shell or in scripts/bootstrap/.env) : -# R720_HOST ssh target (default: 10.0.20.150) -# R720_USER ssh user (default: ansible) -# FORGEJO_API_URL default: https://forgejo.talas.group -# override with http://10.0.20.105:3000 if no DNS yet -# FORGEJO_OWNER default: talas +# Inputs (env vars or .env file in this dir) : +# R720_HOST ssh target (default: srv-102v) +# R720_USER ssh user (leave empty if alias has User=) +# FORGEJO_API_URL default: https://10.0.20.105:3000 +# FORGEJO_INSECURE 1 to skip TLS verify (default: 1 for LAN) +# FORGEJO_OWNER default: senke # FORGEJO_REPO default: veza -# FORGEJO_ADMIN_TOKEN MANDATORY (Forgejo UI → Settings → Applications) -# ALREADY_PUSHED set to "1" if origin/main already has the -# current HEAD ; skips the auto-push prompt +# FORGEJO_ADMIN_TOKEN MANDATORY (Forgejo Settings → Applications) +# +# Sudo on the R720 : NOT NOPASSWD. Ansible prompts the operator ONCE +# per run via --ask-become-pass. set -Eeuo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -# shellcheck source=lib.sh . "$SCRIPT_DIR/lib.sh" trap_errors -# Optional .env in the bootstrap dir for non-secret defaults. [[ -f "$SCRIPT_DIR/.env" ]] && . 
"$SCRIPT_DIR/.env" -: "${R720_HOST:=10.0.20.150}" -: "${R720_USER:=ansible}" -: "${FORGEJO_API_URL:=https://forgejo.talas.group}" -: "${FORGEJO_OWNER:=talas}" +: "${R720_HOST:=srv-102v}" +: "${R720_USER:=}" +: "${FORGEJO_API_URL:=https://10.0.20.105:3000}" +: "${FORGEJO_INSECURE:=1}" +: "${FORGEJO_OWNER:=senke}" : "${FORGEJO_REPO:=veza}" -REPO_ROOT=$(git -C "$SCRIPT_DIR" rev-parse --show-toplevel 2>/dev/null) \ +REPO_ROOT=$(git -C "$SCRIPT_DIR" rev-parse --show-toplevel) \ || die "not in a git repo (or git missing)" VAULT_YML="$REPO_ROOT/infra/ansible/group_vars/all/vault.yml" VAULT_EXAMPLE="$REPO_ROOT/infra/ansible/group_vars/all/vault.yml.example" VAULT_PASS="$REPO_ROOT/infra/ansible/.vault-pass" -# State file lives under the repo so the local script doesn't need root. TALAS_STATE_DIR="$REPO_ROOT/.git/talas-bootstrap" TALAS_STATE_FILE="$TALAS_STATE_DIR/local.state" -# ============================================================================ -# Vault autofill helpers (used by phase 2) -# ============================================================================ +# SSH target = "user@host" or just "host" if R720_USER is empty +# (alias's User= line wins). +if [[ -n "$R720_USER" ]]; then SSH_TARGET="$R720_USER@$R720_HOST"; else SSH_TARGET="$R720_HOST"; fi -# Generate a URL-safe random string (no /=+ which break sed and yaml). +# ============================================================================ +# Vault helpers (used by phase 2) +# ============================================================================ _rand_token() { local len=${1:-32} openssl rand -base64 $((len * 2)) 2>/dev/null | tr -dc 'A-Za-z0-9' | head -c "$len" } -# Replace a single `vault_: ""` line with a generated value. -# Idempotent : if the line is already non-TODO, no-op. _autofill_field() { local file=$1 key=$2 value=$3 - # Escape sed delimiters in value (we use | as delimiter, so escape any |) local esc=${value//|/\\|} sed -i "s|^${key}: \". 
_autogen_jwt_keys() { local file=$1 - if ! grep -q '' "$file"; then - return 0 - fi + grep -q '' "$file" || return 0 info "generating RS256 JWT keypair" local priv pub priv=$(openssl genrsa 4096 2>/dev/null) || die "openssl genrsa failed" pub=$(echo "$priv" | openssl rsa -pubout 2>/dev/null) || die "openssl rsa -pubout failed" - local priv_b64 pub_b64 - priv_b64=$(echo "$priv" | base64 -w0) - pub_b64=$(echo "$pub" | base64 -w0) - _autofill_field "$file" vault_jwt_signing_key_b64 "$priv_b64" - _autofill_field "$file" vault_jwt_public_key_b64 "$pub_b64" - ok "JWT keys generated and inserted" + _autofill_field "$file" vault_jwt_signing_key_b64 "$(echo "$priv" | base64 -w0)" + _autofill_field "$file" vault_jwt_public_key_b64 "$(echo "$pub" | base64 -w0)" + ok "JWT keys generated" } -# Autofill all the vault fields whose value can be safely random-generated. -# Optional / external fields (smtp, hyperswitch, stripe, oauth_clients, -# sentry) are left as for the operator to either fill or skip. _autofill_vault_secrets() { - local file=$1 - local filled=() - - # Strong passwords (32 alphanumeric chars). 
- local pw_fields=( - vault_postgres_password - vault_postgres_replication_password - vault_redis_password - vault_rabbitmq_password - vault_minio_root_password - vault_chat_jwt_secret - vault_oauth_encryption_key - vault_stream_internal_api_key - ) - for k in "${pw_fields[@]}"; do + local file=$1 filled=() + for k in vault_postgres_password vault_postgres_replication_password \ + vault_redis_password vault_rabbitmq_password \ + vault_minio_root_password vault_chat_jwt_secret \ + vault_oauth_encryption_key vault_stream_internal_api_key; do if grep -q "^${k}: \" 0 )); then - ok "auto-generated ${#filled[@]} secret(s) : ${filled[*]}" - fi + grep -q '^vault_minio_access_key: " 0 )) && ok "auto-generated ${#filled[@]} secret(s) : ${filled[*]}" } # ============================================================================ @@ -140,45 +106,34 @@ _autofill_vault_secrets() { phase_1_preflight() { section "Phase 1 — Preflight" _current_phase=preflight - phase preflight START - - skip_if_done preflight "preflight" && { phase preflight DONE; return 0; } + skip_if_done preflight "preflight" && return 0 require_cmd git ansible ansible-vault dig curl ssh openssl base64 jq - require_file "$VAULT_EXAMPLE" - require_file "$REPO_ROOT/infra/ansible/playbooks/haproxy.yml" - require_file "$REPO_ROOT/infra/ansible/inventory/staging.yml" - info "Testing SSH to $R720_USER@$R720_HOST…" - if ! ssh -o ConnectTimeout=5 -o BatchMode=yes "$R720_USER@$R720_HOST" /bin/true 2>/dev/null; then - TALAS_HINT="ensure your ssh key is in $R720_USER@$R720_HOST:~/.ssh/authorized_keys, then try ssh $R720_USER@$R720_HOST" - die "SSH to $R720_USER@$R720_HOST failed" - fi + info "SSH to $SSH_TARGET" + ssh -o ConnectTimeout=5 -o BatchMode=yes "$SSH_TARGET" /bin/true \ + || { TALAS_HINT="check ~/.ssh/config (Host $R720_HOST) ; key in agent ?" + die "SSH to $SSH_TARGET failed"; } ok "SSH OK" - info "Checking that incus is reachable on R720…" - if ! 
ssh "$R720_USER@$R720_HOST" "command -v incus >/dev/null && incus list >/dev/null 2>&1"; then - TALAS_HINT="run 'incus list' as $R720_USER on $R720_HOST manually ; verify the user is in the 'incus-admin' group" - die "incus on $R720_HOST not accessible by $R720_USER" - fi - ok "incus reachable" - - info "Checking DNS resolution for the public domains…" - local missing_dns=() - for d in veza.fr staging.veza.fr talas.fr forgejo.talas.group; do - if ! dig +short +time=2 +tries=1 "$d" @1.1.1.1 2>/dev/null | grep -qE '^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$'; then - missing_dns+=("$d") - fi - done - if (( ${#missing_dns[@]} > 0 )); then - warn "DNS not resolved for: ${missing_dns[*]}" - warn "Let's Encrypt (phase 5) will fail for those domains. Configure DNS first or expect partial cert issuance." + info "incus reachable on R720 (no sudo)" + if ssh "$SSH_TARGET" "incus list >/dev/null 2>&1"; then + ok "operator is in incus-admin group (no sudo needed)" else - ok "all 4 public domains resolve" + warn "operator can't 'incus list' without sudo — fine, ansible will prompt for sudo password" fi + info "DNS resolution" + local missing=() + for d in veza.fr staging.veza.fr talas.fr forgejo.talas.group; do + dig +short +time=2 +tries=1 "$d" @1.1.1.1 2>/dev/null | grep -qE '^[0-9]+\.' 
\ + || missing+=("$d") + done + (( ${#missing[@]} > 0 )) \ + && warn "DNS missing for: ${missing[*]} — Let's Encrypt will fail for those" \ + || ok "all 4 public domains resolve" + mark_done preflight - phase preflight DONE } # ============================================================================ @@ -187,67 +142,41 @@ phase_1_preflight() { phase_2_vault() { section "Phase 2 — Local vault" _current_phase=vault - phase vault START - - if skip_if_done vault "vault setup"; then - phase vault DONE; return 0 - fi + skip_if_done vault "vault setup" && return 0 if [[ -f "$VAULT_YML" ]] && head -1 "$VAULT_YML" 2>/dev/null | grep -q '^\$ANSIBLE_VAULT'; then - info "vault.yml already encrypted — verifying password works" - [[ -f "$VAULT_PASS" ]] || die "vault.yml encrypted but $VAULT_PASS missing — re-create it manually" + info "vault.yml already encrypted" + [[ -f "$VAULT_PASS" ]] || die "vault.yml encrypted but $VAULT_PASS missing — recover with ./reset-vault.sh" else - if [[ -f "$VAULT_YML" ]]; then - warn "vault.yml exists in PLAINTEXT — will autofill remaining + encrypt" - else + if [[ ! -f "$VAULT_YML" ]]; then info "rendering vault.yml from example" cp "$VAULT_EXAMPLE" "$VAULT_YML" fi - _autogen_jwt_keys "$VAULT_YML" _autofill_vault_secrets "$VAULT_YML" - local remaining remaining=$(grep -cE ' 0 )); then - warn "$remaining placeholders left (optional fields ; safe to leave or fill later)" + warn "$remaining placeholders left (optional fields)" grep -n '&2 - local cont - prompt_value cont "blank these out and continue ? (y/n)" "y" - if [[ "${cont,,}" == "y" ]]; then - # Replace any line whose value still has "$VAULT_PASS" - chmod 0400 "$VAULT_PASS" - ok "wrote $VAULT_PASS" - # If vault.yml is plaintext, encrypt now. - if ! head -1 "$VAULT_YML" | grep -q '^\$ANSIBLE_VAULT'; then - info "encrypting vault.yml" - ansible-vault encrypt --vault-password-file "$VAULT_PASS" "$VAULT_YML" - ok "encrypted" + if [[ ! 
-f "$VAULT_PASS" ]]; then + local pw; prompt_password pw "choose a vault password (memorize it !)" + ( umask 077; echo "$pw" > "$VAULT_PASS" ) + chmod 0400 "$VAULT_PASS" fi + ansible-vault encrypt --vault-password-file "$VAULT_PASS" "$VAULT_YML" + ok "encrypted" fi - - info "verifying we can decrypt" - if ! ansible-vault view --vault-password-file "$VAULT_PASS" "$VAULT_YML" >/dev/null 2>&1; then - TALAS_HINT="if you remember the password, edit $VAULT_PASS to match. Otherwise run scripts/bootstrap/reset-vault.sh to start over." - die "cannot decrypt $VAULT_YML with $VAULT_PASS — password mismatch" - fi - ok "vault decryption verified" - + ansible-vault view --vault-password-file "$VAULT_PASS" "$VAULT_YML" >/dev/null \ + || { TALAS_HINT="run ./reset-vault.sh to start over" + die "cannot decrypt $VAULT_YML"; } + ok "decryption verified" mark_done vault - phase vault DONE } # ============================================================================ @@ -256,299 +185,158 @@ phase_2_vault() { phase_3_forgejo() { section "Phase 3 — Forgejo Secrets + Variables" _current_phase=forgejo - phase forgejo START - - if skip_if_done forgejo "Forgejo provisioning"; then - phase forgejo DONE; return 0 - fi + skip_if_done forgejo "Forgejo provisioning" && return 0 require_env FORGEJO_ADMIN_TOKEN \ - "create at $FORGEJO_API_URL/-/user/settings/applications (scopes: write:repository + write:package, optionally write:admin to auto-create registry tokens)" + "create at $FORGEJO_API_URL/-/user/settings/applications (scopes: write:repository + write:package)" - local insecure=() - [[ "${FORGEJO_INSECURE:-0}" == "1" ]] && insecure=(-k) + local insec=() + [[ "${FORGEJO_INSECURE:-0}" == "1" ]] && insec=(-k) - info "checking Forgejo API reachability (no-auth /version probe)" - if ! 
curl -fsSL "${insecure[@]}" --max-time 10 \ - "$FORGEJO_API_URL/api/v1/version" >/dev/null 2>&1; then - TALAS_HINT="check FORGEJO_API_URL ($FORGEJO_API_URL) ; for self-signed certs set FORGEJO_INSECURE=1 in .env ; verify WireGuard if URL is on the LAN" - die "Forgejo API unreachable" - fi - ok "Forgejo API reachable" + info "API reachability (auth-free /version probe)" + curl -fsSL "${insec[@]}" --max-time 10 "$FORGEJO_API_URL/api/v1/version" >/dev/null \ + || die "Forgejo API unreachable at $FORGEJO_API_URL" + ok "reachable" - info "checking repo $FORGEJO_OWNER/$FORGEJO_REPO + token has write access" - if ! forgejo_api GET "/repos/$FORGEJO_OWNER/$FORGEJO_REPO" >/dev/null 2>&1; then - TALAS_HINT="verify FORGEJO_OWNER + FORGEJO_REPO (currently $FORGEJO_OWNER/$FORGEJO_REPO) ; verify token scope includes read:repository" - die "repo $FORGEJO_OWNER/$FORGEJO_REPO not found or token lacks read:repository" - fi + info "repo $FORGEJO_OWNER/$FORGEJO_REPO + token write access" + forgejo_api GET "/repos/$FORGEJO_OWNER/$FORGEJO_REPO" >/dev/null \ + || die "repo not found or token lacks read:repository" ok "repo + token OK" - # FORGEJO_REGISTRY_TOKEN — set once, then leave alone. Re-runs of - # phase 3 don't re-prompt unless the secret has been deleted in - # Forgejo UI, OR the operator sets FORCE_FORGEJO_REPROMPT=1. - # NB: Forgejo doesn't expose GET /actions/secrets/ — we list - # all secrets and grep by name. - local _secret_exists=0 - if forgejo_api GET "/repos/$FORGEJO_OWNER/$FORGEJO_REPO/actions/secrets" 2>/dev/null \ - | jq -e '.[]? | select(.name == "FORGEJO_REGISTRY_TOKEN")' >/dev/null 2>&1; then - _secret_exists=1 - fi - if [[ "${FORCE_FORGEJO_REPROMPT:-0}" != "1" ]] && (( _secret_exists == 1 )); then - ok "secret FORGEJO_REGISTRY_TOKEN already set (set FORCE_FORGEJO_REPROMPT=1 to replace)" + # Registry token : skip if already set ; else prompt. + local _exists=0 + forgejo_api GET "/repos/$FORGEJO_OWNER/$FORGEJO_REPO/actions/secrets" 2>/dev/null \ + | jq -e '.[]? 
| select(.name == "FORGEJO_REGISTRY_TOKEN")' >/dev/null \ + && _exists=1 + if [[ "${FORCE_FORGEJO_REPROMPT:-0}" != "1" ]] && (( _exists == 1 )); then + ok "FORGEJO_REGISTRY_TOKEN already set (FORCE_FORGEJO_REPROMPT=1 to replace)" else - local registry_token="" + local rtok="" if [[ -n "${FORGEJO_REGISTRY_TOKEN:-}" ]]; then - info "using FORGEJO_REGISTRY_TOKEN from environment" - registry_token="$FORGEJO_REGISTRY_TOKEN" + rtok="$FORGEJO_REGISTRY_TOKEN" else - info "trying to auto-create a registry token (needs write:admin scope on admin token)" - local resp - resp=$(forgejo_api POST "/users/$FORGEJO_OWNER/tokens" \ - --data "$(jq -nc --arg n "veza-deploy-registry-$(date +%s)" \ - --argjson s '["write:package", "read:package"]' \ - '{name: $n, scopes: $s}')" 2>/dev/null \ - || true) - registry_token=$(echo "$resp" | jq -r '.sha1 // empty' 2>/dev/null || true) - if [[ -z "$registry_token" ]]; then - warn "auto-create failed (admin token lacks write:admin or sudo)" - warn "create the token manually :" - warn " $FORGEJO_API_URL/-/user/settings/applications" - warn " → 'Generate New Token' → name 'veza-deploy-registry'" - warn " → scopes: write:package, read:package" - prompt_password registry_token "paste the token value (input hidden)" - else - ok "auto-created registry token (${#registry_token} chars)" - fi + warn "create the token manually at $FORGEJO_API_URL/-/user/settings/applications" + warn " → name: veza-deploy-registry, scopes: write:package + read:package" + prompt_password rtok "paste the token (input hidden)" fi - forgejo_set_secret "$FORGEJO_OWNER" "$FORGEJO_REPO" FORGEJO_REGISTRY_TOKEN "$registry_token" + forgejo_set_secret "$FORGEJO_OWNER" "$FORGEJO_REPO" FORGEJO_REGISTRY_TOKEN "$rtok" fi - # Vault password is always re-set from the current .vault-pass — cheap, - # idempotent, and survives a re-run after rotation. 
forgejo_set_secret "$FORGEJO_OWNER" "$FORGEJO_REPO" ANSIBLE_VAULT_PASSWORD "$(cat "$VAULT_PASS")" - forgejo_set_var "$FORGEJO_OWNER" "$FORGEJO_REPO" FORGEJO_REGISTRY_URL \ - "$FORGEJO_API_URL/api/packages/$FORGEJO_OWNER/generic" - + forgejo_set_var "$FORGEJO_OWNER" "$FORGEJO_REPO" FORGEJO_REGISTRY_URL \ + "$FORGEJO_API_URL/api/packages/$FORGEJO_OWNER/generic" mark_done forgejo - phase forgejo DONE } # ============================================================================ -# Phase 4 — R720 remote bootstrap +# Phase 4 — single ansible-playbook bootstrap (no shell SSH plumbing) # ============================================================================ -phase_4_r720() { - section "Phase 4 — R720 remote bootstrap (Incus profiles + runner labels)" - _current_phase=r720 - phase r720 START +phase_4_ansible() { + section "Phase 4 — Ansible bootstrap (runner pipeline + edge HAProxy)" + _current_phase=ansible + skip_if_done ansible "ansible bootstrap" && return 0 - if skip_if_done r720 "R720 remote bootstrap"; then - phase r720 DONE; return 0 - fi - - require_env FORGEJO_ADMIN_TOKEN - info "fetching a runner registration token from Forgejo" - local reg_token - reg_token=$(forgejo_get_runner_token "$FORGEJO_OWNER" "$FORGEJO_REPO") \ - || die "could not fetch runner registration token" - info "got registration token (${#reg_token} chars)" - - local remote_script="$SCRIPT_DIR/bootstrap-remote.sh" - local remote_lib="$SCRIPT_DIR/lib.sh" - require_file "$remote_script" - require_file "$remote_lib" - - # SSH target string handles both "user@host" and pure "host" (when - # the alias's User= line is the source of truth). 
- local ssh_target - if [[ -n "${R720_USER:-}" ]]; then - ssh_target="${R720_USER}@${R720_HOST}" - else - ssh_target="${R720_HOST}" - fi - - info "uploading lib.sh + bootstrap-remote.sh to $ssh_target:/tmp/talas-bootstrap/" - ssh "$ssh_target" "mkdir -p /tmp/talas-bootstrap" \ - || die "ssh mkdir failed (target: $ssh_target)" - scp -q "$remote_lib" "$remote_script" "$ssh_target:/tmp/talas-bootstrap/" \ - || die "scp failed (target: $ssh_target)" - ok "uploaded" - - info "running bootstrap-remote.sh over ssh -t (TTY for sudo prompt)" - info " → if sudo asks for a password, type it once at the prompt below" - # ssh -t allocates a TTY so sudo can prompt for the password. Set vars - # via env=… so they're available inside the sudo'd script (sudo -E - # only preserves explicit pre-existing env vars, not ones set on the - # ssh command line). The /var/log/talas-bootstrap.log on R720 keeps - # a copy of the output even if the SSH stream gets cut. - if ! ssh -t "$ssh_target" \ - "sudo env FORGEJO_REGISTRATION_TOKEN='$reg_token' \ - FORGEJO_API_URL='$FORGEJO_API_URL' \ - bash /tmp/talas-bootstrap/bootstrap-remote.sh"; then - TALAS_HINT="ssh to $ssh_target and tail /var/log/talas-bootstrap.log ; or set up passwordless sudo : echo '$R720_USER ALL=(ALL) NOPASSWD: /usr/bin/bash' | sudo tee /etc/sudoers.d/talas-bootstrap" - die "remote bootstrap failed" - fi - - # Cleanup uploaded scripts. 
- ssh "$ssh_target" "rm -rf /tmp/talas-bootstrap" || true - - mark_done r720 - phase r720 DONE -} - -# ============================================================================ -# Phase 5 — Edge HAProxy + Let's Encrypt -# ============================================================================ -phase_5_haproxy() { - section "Phase 5 — Edge HAProxy + Let's Encrypt certs" - _current_phase=haproxy - phase haproxy START - - if skip_if_done haproxy "haproxy + LE"; then - phase haproxy DONE; return 0 - fi - - cd "$REPO_ROOT/infra/ansible" - - # Ansible collections needed by the haproxy/deploy playbooks. - # ansible.cfg sets stdout_callback=yaml which lives in - # community.general — without it, ansible-playbook errors out - # immediately ("Invalid callback for stdout specified: yaml"). - info "ensuring ansible collections (community.general / .postgresql / .rabbitmq) are installed" + info "ensuring ansible collections (community.general / .postgresql / .rabbitmq)" for col in community.general community.postgresql community.rabbitmq; do - if ! ansible-galaxy collection list "$col" 2>/dev/null | grep -q "^$col"; then - info "installing $col" - ansible-galaxy collection install "$col" >/dev/null \ - || die "ansible-galaxy collection install $col failed (network ? ~/.ansible/ writable ?)" - fi + ansible-galaxy collection list "$col" 2>/dev/null | grep -q "^$col" \ + || ansible-galaxy collection install "$col" >/dev/null \ + || die "ansible-galaxy install $col failed" done ok "collections present" - # Compute SSH target the same way phase 4 does. - local ssh_target - if [[ -n "${R720_USER:-}" ]]; then - ssh_target="${R720_USER}@${R720_HOST}" - else - ssh_target="${R720_HOST}" - fi + require_env FORGEJO_ADMIN_TOKEN - # Detect if NOPASSWD sudo is configured ; if not, pass --ask-become-pass. 
- local become_flag=() - if ssh "$ssh_target" "sudo -n /bin/true" >/dev/null 2>&1; then - ok "passwordless sudo on R720 — running ansible without -K" - else - info "sudo on R720 needs a password — passing --ask-become-pass" - info " → ansible will prompt 'BECOME password:' below ; type your sudo password" - become_flag=(--ask-become-pass) - fi + # Fetch a runner registration token. + info "fetching runner registration token from Forgejo" + local reg_token + reg_token=$(forgejo_get_runner_token "$FORGEJO_OWNER" "$FORGEJO_REPO") \ + || die "runner registration token fetch failed" - # Detect the Incus network actually present on the R720. Try in - # order : - # 1. No sudo (operator in `incus-admin` group) - # 2. sudo -n (NOPASSWD sudo configured) - # 3. Give up and let the playbook use the group_vars default - # Probe is via the existing forgejo container (whose network we - # know is the right one) ; fall back to listing managed bridges. + cd "$REPO_ROOT/infra/ansible" + + # Detect Incus network from forgejo container (no sudo needed if + # operator is in incus-admin group, otherwise skip — Ansible's own + # tasks will handle it via the host's incus binary). info "detecting Incus network on R720" - local detected_net="" - local _try_cmds=( - "incus config device get forgejo eth0 network 2>/dev/null" - "sudo -n incus config device get forgejo eth0 network 2>/dev/null" - ) - for cmd in "${_try_cmds[@]}"; do - detected_net=$(ssh "$ssh_target" "$cmd" 2>/dev/null | tr -d '[:space:]' || true) - [[ -n "$detected_net" && "$detected_net" != "None" ]] && break - done - if [[ -z "$detected_net" || "$detected_net" == "None" ]]; then - # Fallback : first managed bridge. 
- for cmd in \ - "incus network list -f csv 2>/dev/null | awk -F, '\$2==\"bridge\" && \$3==\"YES\" {print \$1; exit}'" \ - "sudo -n incus network list -f csv 2>/dev/null | awk -F, '\$2==\"bridge\" && \$3==\"YES\" {print \$1; exit}'" - do - detected_net=$(ssh "$ssh_target" "$cmd" 2>/dev/null | tr -d '[:space:]' || true) - [[ -n "$detected_net" ]] && break - done - fi - local extra_vars=() - if [[ -n "$detected_net" ]]; then - ok "Incus network detected : $detected_net" - extra_vars+=("--extra-vars" "veza_incus_network=$detected_net") - else - warn "could not auto-detect Incus network ; playbook will use the group_vars default (net-veza)" - fi + local detected_net + detected_net=$(ssh "$SSH_TARGET" \ + "incus config device get forgejo eth0 network 2>/dev/null" \ + | tr -d '[:space:]' || true) + [[ -z "$detected_net" || "$detected_net" == "None" ]] && detected_net="net-veza" + ok "Incus network : $detected_net" - info "running ansible-playbook playbooks/haproxy.yml (5–10 min)" - if ! ansible-playbook -i inventory/staging.yml playbooks/haproxy.yml \ + info "running bootstrap_runner.yml + haproxy.yml" + info " → ansible will prompt 'BECOME password:' below — type your sudo password ON THE R720" + info " (NOT a NOPASSWD-sudo bypass — your password is sent over SSH and never stored)" + + # Single ansible-playbook invocation runs both playbooks in sequence. + # --ask-become-pass prompts ONCE for sudo on the R720 ; that password + # is held in memory by ansible and reused for every become: true task + # in both playbooks. No NOPASSWD sudo needed. + if ! 
ansible-playbook \ + -i inventory/staging.yml \ --vault-password-file .vault-pass \ - "${become_flag[@]}" \ - "${extra_vars[@]}"; then - TALAS_HINT="check the ansible output above ; common issues : Incus network mismatch, port 80 blocked from Internet, DNS not yet propagated, sudo password rejected" - die "ansible-playbook haproxy.yml failed" + --ask-become-pass \ + -e forgejo_registration_token="$reg_token" \ + -e forgejo_api_url="$FORGEJO_API_URL" \ + -e veza_incus_network="$detected_net" \ + playbooks/bootstrap_runner.yml \ + playbooks/haproxy.yml; then + TALAS_HINT="check ansible output above ; common: wrong sudo password, port 80 not reachable from Internet (Let's Encrypt HTTP-01)" + die "ansible-playbook failed" fi info "verifying Let's Encrypt certs landed" local certs - certs=$(ssh "$R720_USER@$R720_HOST" "incus exec veza-haproxy -- ls /usr/local/etc/tls/haproxy/ 2>/dev/null" || true) - if [[ -z "$certs" ]]; then - warn "no certs found in /usr/local/etc/tls/haproxy/ on veza-haproxy" - warn "check /var/log/letsencrypt or run again — dehydrated retries on next playbook run" - return 1 - fi - ok "certs : $(echo "$certs" | tr '\n' ' ')" + certs=$(ssh "$SSH_TARGET" \ + "incus exec veza-haproxy -- ls /usr/local/etc/tls/haproxy/ 2>/dev/null" || true) + [[ -n "$certs" ]] \ + && ok "certs : $(echo "$certs" | tr '\n' ' ')" \ + || warn "no certs found — re-run, or check port 80 reachable from Internet" - mark_done haproxy - phase haproxy DONE + mark_done ansible } # ============================================================================ -# Phase 6 — Summary +# Phase 5 — Summary # ============================================================================ -phase_6_summary() { - section "Phase 6 — Summary" +phase_5_summary() { + section "Phase 5 — Summary" _current_phase=summary - phase summary START - cat <&2 + cat >&2 </dev/null 2>&1; then + info "installing ansible + python deps" + apt-get update -qq + apt-get install -y ansible python3-psycopg2 jq + fi + ok 
"ansible $(ansible --version | head -1 | awk '{print $2}')" + + # Required containers must already exist (forgejo + forgejo-runner). + incus info forgejo >/dev/null 2>&1 \ + || die "container 'forgejo' missing on this host — bootstrap forgejo first" + incus info forgejo-runner >/dev/null 2>&1 \ + || die "container 'forgejo-runner' missing on this host" + ok "forgejo + forgejo-runner containers present" + + # net-veza must exist as a network. + incus network show net-veza >/dev/null 2>&1 \ + || die "incus network 'net-veza' missing — create with: incus network create net-veza ipv4.address=10.0.20.1/24 ipv4.nat=true" + ok "net-veza network present" + + mark_done preflight +} + +# ============================================================================ +# Phase 2 — vault (assumes vault.yml + .vault-pass already exist on R720) +# ============================================================================ +phase_2_vault() { + section "Phase 2 — Vault check" + _current_phase=vault + skip_if_done vault "vault check" && return 0 + + require_file "$VAULT_YML" + require_file "$VAULT_PASS" + ansible-vault view --vault-password-file "$VAULT_PASS" "$VAULT_YML" >/dev/null \ + || die "cannot decrypt $VAULT_YML with $VAULT_PASS" + ok "vault present + decryptable" + mark_done vault +} + +# ============================================================================ +# Phase 3 — Forgejo Secrets + Variables (HTTPS API ; no Ansible needed) +# ============================================================================ +phase_3_forgejo() { + section "Phase 3 — Forgejo Secrets + Variables" + _current_phase=forgejo + skip_if_done forgejo "Forgejo provisioning" && return 0 + + require_env FORGEJO_ADMIN_TOKEN \ + "create at $FORGEJO_API_URL/-/user/settings/applications" + + local insec=() + [[ "${FORGEJO_INSECURE:-0}" == "1" ]] && insec=(-k) + + curl -fsSL "${insec[@]}" --max-time 10 "$FORGEJO_API_URL/api/v1/version" >/dev/null \ + || die "Forgejo API unreachable at $FORGEJO_API_URL" 
+ forgejo_api GET "/repos/$FORGEJO_OWNER/$FORGEJO_REPO" >/dev/null \ + || die "repo not found / token lacks read:repository" + ok "Forgejo reachable + repo OK" + + local _exists=0 + forgejo_api GET "/repos/$FORGEJO_OWNER/$FORGEJO_REPO/actions/secrets" 2>/dev/null \ + | jq -e '.[]? | select(.name == "FORGEJO_REGISTRY_TOKEN")' >/dev/null \ + && _exists=1 + if [[ "${FORCE_FORGEJO_REPROMPT:-0}" != "1" ]] && (( _exists == 1 )); then + ok "FORGEJO_REGISTRY_TOKEN already set" + else + local rtok="${FORGEJO_REGISTRY_TOKEN:-}" + if [[ -z "$rtok" ]]; then + warn "create token at $FORGEJO_API_URL/-/user/settings/applications (write:package)" + prompt_password rtok "paste token (input hidden)" + fi + forgejo_set_secret "$FORGEJO_OWNER" "$FORGEJO_REPO" FORGEJO_REGISTRY_TOKEN "$rtok" + fi + + forgejo_set_secret "$FORGEJO_OWNER" "$FORGEJO_REPO" ANSIBLE_VAULT_PASSWORD "$(cat "$VAULT_PASS")" + forgejo_set_var "$FORGEJO_OWNER" "$FORGEJO_REPO" FORGEJO_REGISTRY_URL \ + "$FORGEJO_API_URL/api/packages/$FORGEJO_OWNER/generic" + mark_done forgejo +} + +# ============================================================================ +# Phase 4 — local Ansible (connection: local, no SSH, no --ask-become-pass) +# ============================================================================ +phase_4_ansible() { + section "Phase 4 — Ansible bootstrap (local)" + _current_phase=ansible + skip_if_done ansible "ansible bootstrap" && return 0 + + info "ensuring ansible collections" + for col in community.general community.postgresql community.rabbitmq; do + ansible-galaxy collection list "$col" 2>/dev/null | grep -q "^$col" \ + || ansible-galaxy collection install "$col" >/dev/null \ + || die "ansible-galaxy install $col failed" + done + ok "collections present" + + require_env FORGEJO_ADMIN_TOKEN + + info "fetching runner registration token from Forgejo" + local reg_token + reg_token=$(forgejo_get_runner_token "$FORGEJO_OWNER" "$FORGEJO_REPO") \ + || die "runner registration token fetch failed" + + 
cd "$REPO_ROOT/infra/ansible" + + # Detect network — we're root, sudo not needed. + local detected_net + detected_net=$(incus config device get forgejo eth0 network 2>/dev/null \ + | tr -d '[:space:]' || true) + [[ -z "$detected_net" || "$detected_net" == "None" ]] && detected_net="net-veza" + ok "Incus network : $detected_net" + + info "running bootstrap_runner.yml + haproxy.yml against inventory/local.yml" + if ! ansible-playbook \ + -i inventory/local.yml \ + --vault-password-file "$VAULT_PASS" \ + -e forgejo_registration_token="$reg_token" \ + -e forgejo_api_url="$FORGEJO_API_URL" \ + -e veza_incus_network="$detected_net" \ + playbooks/bootstrap_runner.yml \ + playbooks/haproxy.yml; then + TALAS_HINT="check ansible output above ; common: port 80 not reachable from Internet for LE HTTP-01" + die "ansible-playbook failed" + fi + + info "verifying Let's Encrypt certs" + local certs + certs=$(incus exec veza-haproxy -- ls /usr/local/etc/tls/haproxy/ 2>/dev/null || true) + [[ -n "$certs" ]] \ + && ok "certs : $(echo "$certs" | tr '\n' ' ')" \ + || warn "no certs — check port 80 reachable from Internet, then re-run" + mark_done ansible +} + +# ============================================================================ +# Phase 5 — Summary +# ============================================================================ +phase_5_summary() { + section "Phase 5 — Summary" + cat >&2 <>>PHASE::<<< markers on stdout for the local script. - -# lib.sh is concatenated upstream by bootstrap-local before this file is -# piped to bash. When run standalone, source it manually. -if ! declare -F info >/dev/null 2>&1; then - SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - # shellcheck source=lib.sh - . "$SCRIPT_DIR/lib.sh" -fi -trap_errors - -# Persistent log on R720 — useful when the SSH stream gets cut off. 
-exec > >(tee -a /var/log/talas-bootstrap.log) 2>&1 - -: "${FORGEJO_API_URL:=https://forgejo.talas.group}" - -# ============================================================================ -# Phase R1 — Incus profiles -# ============================================================================ -remote_phase_1_profiles() { - section "R1 — Incus profiles (veza-app, veza-data)" - _current_phase=r1_profiles - phase r1_profiles START - - if skip_if_done r1_profiles "incus profiles"; then - phase r1_profiles DONE; return 0 - fi - - # Two profiles only — `veza-app` for app/edge containers, `veza-data` - # for the persistent data tier. Both empty by default (the operator - # adds resource limits / AppArmor rules later). The network device - # is NOT attached here ; playbooks pass `--network ` at launch - # so the caller controls which bridge the container lands on. - # An older revision created a `veza-net` profile too — drop it if - # it's there from a previous bootstrap, since it's redundant with - # the explicit --network flag. 
- for p in veza-app veza-data; do - if incus profile show "$p" >/dev/null 2>&1; then - ok "profile $p already exists" - else - incus profile create "$p" - ok "profile $p created (empty — operator may add limits later)" - fi - done - - if incus profile show veza-net >/dev/null 2>&1; then - if [[ "$(incus profile device list veza-net 2>/dev/null | wc -l)" -eq 0 ]]; then - warn "found legacy empty profile 'veza-net' — removing (network is set via --network on launch)" - incus profile delete veza-net 2>/dev/null || true - else - warn "legacy 'veza-net' profile has devices attached — leaving alone" - fi - fi - - mark_done r1_profiles - phase r1_profiles DONE -} - -# ============================================================================ -# Phase R2 — mount Incus socket into forgejo-runner container -# ============================================================================ -remote_phase_2_runner_socket() { - section "R2 — mount /var/lib/incus/unix.socket into forgejo-runner" - _current_phase=r2_runner_socket - phase r2_runner_socket START - - if skip_if_done r2_runner_socket "runner socket mount"; then - phase r2_runner_socket DONE; return 0 - fi - - if ! 
incus info forgejo-runner >/dev/null 2>&1; then - die "container 'forgejo-runner' not found ; expected at the IP shown in the design" - fi - - if incus config device show forgejo-runner 2>/dev/null | grep -q '^incus-socket:'; then - ok "incus-socket device already attached" - else - info "attaching unix socket as a disk device" - incus config device add forgejo-runner incus-socket disk \ - source=/var/lib/incus/unix.socket \ - path=/var/lib/incus/unix.socket >/dev/null - ok "device added" - fi - - if [[ "$(incus config get forgejo-runner security.nesting)" != "true" ]]; then - info "enabling security.nesting" - incus config set forgejo-runner security.nesting=true - ok "nesting=true ; restart required" - info "restarting forgejo-runner container" - incus restart forgejo-runner - sleep 3 - fi - - info "ensuring incus client binary is in the runner" - if incus exec forgejo-runner -- command -v incus >/dev/null 2>&1; then - ok "incus already in runner" - elif [[ -x /usr/bin/incus ]]; then - # Push the host's binary into the container — avoids apt repo - # issues (Debian 13 doesn't ship incus-client as a separate - # package, and the full `incus` package would also pull in the - # daemon which we don't want in a runner container). - info "pushing /usr/bin/incus from host into runner:/usr/local/bin/incus" - incus file push /usr/bin/incus forgejo-runner/usr/local/bin/incus --mode 0755 - ok "incus binary pushed" - else - die "no /usr/bin/incus on host AND none in runner — install incus on the host first" - fi - - info "smoke-test : runner can incus list" - if incus exec forgejo-runner -- incus list >/dev/null 2>&1; then - ok "runner has Incus access" - else - # Common cause : the runner's process can read /var/lib/incus/ - # unix.socket only if it has the right gid. The socket is owned - # root:incus-admin (or equivalent) on the host. 
Inside the - # container we either run as root (works) or need to add the - # runner user to a group with the same gid as host's incus-admin. - # We don't try to fix that here — it's runner-process-specific. - warn "runner cannot incus list as default user" - warn "this may be normal if the systemd unit runs as root inside" - warn "the container ; if not, add the runner user to a group with" - warn "the same gid as the host's incus-admin group" - fi - - mark_done r2_runner_socket - phase r2_runner_socket DONE -} - -# ============================================================================ -# Phase R3 — runner label = 'incus' -# ============================================================================ -remote_phase_3_runner_labels() { - section "R3 — forgejo-runner labelled 'incus,self-hosted'" - _current_phase=r3_runner_labels - phase r3_runner_labels START - - if skip_if_done r3_runner_labels "runner labels"; then - phase r3_runner_labels DONE; return 0 - fi - - require_env FORGEJO_REGISTRATION_TOKEN \ - "set on the SSH command-line by bootstrap-local.sh" - - # Find the runner config inside the container. Path varies by install - # method ; act_runner default is /etc/forgejo-runner/.runner. - local runner_cfg - runner_cfg=$(incus exec forgejo-runner -- bash -c ' - for f in /etc/forgejo-runner/.runner /var/lib/forgejo-runner/.runner /opt/forgejo-runner/.runner; do - [[ -f "$f" ]] && echo "$f" && exit 0 - done - exit 1 - ' 2>/dev/null) || true - - local labels="" - if [[ -n "$runner_cfg" ]]; then - labels=$(incus exec forgejo-runner -- jq -r '.labels[]?' 
"$runner_cfg" 2>/dev/null \ - || incus exec forgejo-runner -- grep -oE '"labels":\[[^]]+' "$runner_cfg" 2>/dev/null \ - || echo "") - fi - - if echo "$labels" | grep -qw incus; then - ok "runner already has 'incus' label" - mark_done r3_runner_labels - phase r3_runner_labels DONE - return 0 - fi - - info "re-registering runner with labels incus,self-hosted" - - # Stop systemd unit, wipe old registration, re-register, start. - incus exec forgejo-runner -- systemctl stop forgejo-runner.service 2>/dev/null \ - || incus exec forgejo-runner -- systemctl stop act_runner.service 2>/dev/null \ - || warn "no systemd unit to stop ; will skip" - - [[ -n "$runner_cfg" ]] && incus exec forgejo-runner -- rm -f "$runner_cfg" - - # Detect runner binary name - local runner_bin - runner_bin=$(incus exec forgejo-runner -- bash -c ' - for b in forgejo-runner act_runner; do - command -v "$b" >/dev/null 2>&1 && echo "$b" && exit 0 - done - exit 1 - ' 2>/dev/null) || die "no forgejo-runner / act_runner binary found in container" - - incus exec forgejo-runner -- "$runner_bin" register \ - --no-interactive \ - --instance "$FORGEJO_API_URL" \ - --token "$FORGEJO_REGISTRATION_TOKEN" \ - --name "r720-incus" \ - --labels "incus,self-hosted" - - incus exec forgejo-runner -- systemctl start "$runner_bin.service" \ - || incus exec forgejo-runner -- systemctl start forgejo-runner.service - - ok "runner re-registered with incus label" - - mark_done r3_runner_labels - phase r3_runner_labels DONE -} - -# ============================================================================ -# Phase R4 — sanity, summary -# ============================================================================ -remote_phase_4_sanity() { - section "R4 — sanity check" - _current_phase=r4_sanity - phase r4_sanity START - - info "incus profiles :" - incus profile list -f csv | grep -E '^veza-' | awk -F, '{print " " $1}' - - info "forgejo-runner status :" - incus exec forgejo-runner -- systemctl is-active forgejo-runner.service 
2>/dev/null \ - || incus exec forgejo-runner -- systemctl is-active act_runner.service 2>/dev/null \ - || warn "no active runner service — verify manually" - - info "forgejo container reachable from runner :" - if incus exec forgejo-runner -- curl -sSf -o /dev/null --max-time 5 \ - "$FORGEJO_API_URL" 2>/dev/null \ - || incus exec forgejo-runner -- curl -sSf -ko /dev/null --max-time 5 \ - https://10.0.20.105:3000/ 2>/dev/null \ - || incus exec forgejo-runner -- curl -sSf -o /dev/null --max-time 5 \ - http://10.0.20.105:3000/ 2>/dev/null; then - ok "runner can reach Forgejo" - else - warn "runner cannot reach Forgejo — check WireGuard / DNS / firewall" - fi - - mark_done r4_sanity - phase r4_sanity DONE -} - -main() { - local start=${PHASE:-1} - info "remote bootstrap starting at phase $start (log: /var/log/talas-bootstrap.log)" - - [[ $start -le 1 ]] && remote_phase_1_profiles - [[ $start -le 2 ]] && remote_phase_2_runner_socket - [[ $start -le 3 ]] && remote_phase_3_runner_labels - [[ $start -le 4 ]] && remote_phase_4_sanity - - ok "remote bootstrap done" -} - -main "$@" diff --git a/scripts/bootstrap/verify-local.sh b/scripts/bootstrap/verify-local.sh index f3fe427ee..3f3f62fce 100755 --- a/scripts/bootstrap/verify-local.sh +++ b/scripts/bootstrap/verify-local.sh @@ -100,12 +100,16 @@ check_with_hint "no placeholders left" \ "! 
ansible-vault view --vault-password-file $VAULT_PASS $VAULT_YML 2>/dev/null | grep -q '/dev/null 2>&1'" + "ssh -o BatchMode=yes $SSH_TARGET 'incus list >/dev/null 2>&1'" +check "R720 has bootstrap state file" \ + "ssh -o BatchMode=yes $SSH_TARGET '[[ -f /var/lib/talas/r720-bootstrap.state ]]'" section "DNS public domains" for d in veza.fr www.veza.fr staging.veza.fr talas.fr www.talas.fr forgejo.talas.group; do diff --git a/scripts/bootstrap/verify-r720.sh b/scripts/bootstrap/verify-r720.sh new file mode 100755 index 000000000..870189a3d --- /dev/null +++ b/scripts/bootstrap/verify-r720.sh @@ -0,0 +1,101 @@ +#!/usr/bin/env bash +# verify-r720.sh — read-only checks on the R720 itself. +# +# Run as root : +# sudo bash scripts/bootstrap/verify-r720.sh +# +# Symmetric to verify-local.sh — exit code = number of failures. + +set -uo pipefail +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +. "$SCRIPT_DIR/lib.sh" + +[[ $EUID -ne 0 ]] && warn "running without root — some checks may fail (incus list, ZFS)" + +declare -i PASS=0 FAIL=0 +check() { + local name=$1 cmd=$2 + if eval "$cmd" >/dev/null 2>&1; then ok "$name"; PASS+=1; else err "$name"; FAIL+=1; fi +} +check_with_hint() { + local name=$1 cmd=$2 hint=$3 + if eval "$cmd" >/dev/null 2>&1; then ok "$name"; PASS+=1 + else err "$name"; printf >&2 ' %shint:%s %s\n' "$_YELLOW" "$_RESET" "$hint"; FAIL+=1 + fi +} + +section "Host prerequisites" +check "incus binary" "command -v incus" +check "ansible binary" "command -v ansible" +check "zfs binary" "command -v zfs" +check "incus daemon reachable" "incus list" + +section "Incus profiles" +check_with_hint "profile veza-app" "incus profile show veza-app" \ + "rerun bootstrap-r720.sh phase 4" +check_with_hint "profile veza-data" "incus profile show veza-data" \ + "rerun bootstrap-r720.sh phase 4" + +section "Incus networks" +check_with_hint "net-veza network exists" "incus network show net-veza" \ + "incus network create net-veza ipv4.address=10.0.20.1/24 
ipv4.nat=true" + +section "Forgejo" +check "forgejo container exists" "incus info forgejo" +check "forgejo container RUNNING" "incus list forgejo -f csv -c s 2>/dev/null | grep -q RUNNING" +check "forgejo HTTP responds" "curl -ksSf -o /dev/null --max-time 5 https://10.0.20.105:3000/api/v1/version || curl -sSf -o /dev/null --max-time 5 http://10.0.20.105:3000/api/v1/version" + +section "forgejo-runner" +check "runner container exists" "incus info forgejo-runner" +check "runner container RUNNING" "incus list forgejo-runner -f csv -c s 2>/dev/null | grep -q RUNNING" +check_with_hint "incus-socket device attached" \ + "incus config device show forgejo-runner | grep -q '^incus-socket:'" \ + "rerun bootstrap-r720.sh phase 4" +check_with_hint "security.nesting=true" \ + "[[ \$(incus config get forgejo-runner security.nesting) == true ]]" \ + "incus config set forgejo-runner security.nesting=true && incus restart forgejo-runner" +check_with_hint "incus binary in runner" \ + "incus exec forgejo-runner -- test -x /usr/local/bin/incus" \ + "rerun bootstrap-r720.sh phase 4" +check_with_hint "runner has 'incus' label" \ + "incus exec forgejo-runner -- bash -c 'for f in /etc/forgejo-runner/.runner /var/lib/forgejo-runner/.runner /opt/forgejo-runner/.runner; do [[ -f \$f ]] && grep -q incus \$f && exit 0; done; exit 1'" \ + "rerun bootstrap-r720.sh phase 4 (will re-register)" +check_with_hint "runner systemd unit active" \ + "incus exec forgejo-runner -- bash -c 'systemctl is-active forgejo-runner.service 2>/dev/null || systemctl is-active act_runner.service'" \ + "incus exec forgejo-runner -- journalctl -u forgejo-runner -n 50" + +section "Edge HAProxy (post-haproxy.yml run)" +if incus info veza-haproxy >/dev/null 2>&1; then + check "veza-haproxy RUNNING" "incus list veza-haproxy -f csv -c s | grep -q RUNNING" + check_with_hint "haproxy systemd unit active" \ + "incus exec veza-haproxy -- systemctl is-active haproxy" \ + "incus exec veza-haproxy -- journalctl -u haproxy -n 50" + 
check_with_hint "haproxy.cfg validates" \ + "incus exec veza-haproxy -- haproxy -f /etc/haproxy/haproxy.cfg -c -q" \ + "rerun playbooks/haproxy.yml — config syntax error" + check_with_hint "Let's Encrypt cert dir has at least 1 .pem" \ + "incus exec veza-haproxy -- bash -c 'ls /usr/local/etc/tls/haproxy/*.pem 2>/dev/null | grep -q .'" \ + "verify port 80 reachable from Internet ; rerun playbooks/haproxy.yml" +else + warn "veza-haproxy doesn't exist yet — run bootstrap-r720.sh phase 4" +fi + +section "ZFS" +check "rpool exists" "zpool list rpool" + +section "State file" +if [[ -f "$TALAS_STATE_FILE" ]]; then + info "phases recorded :" + sed 's/^/ /' "$TALAS_STATE_FILE" +else + warn "no state file at $TALAS_STATE_FILE — bootstrap-r720.sh hasn't run yet" +fi + +section "Result" +if (( FAIL == 0 )); then + ok "$PASS / $((PASS + FAIL)) checks passed" + exit 0 +else + err "$FAIL FAIL out of $((PASS + FAIL)) ($PASS passed)" + exit 1 +fi diff --git a/scripts/bootstrap/verify-remote-ssh.sh b/scripts/bootstrap/verify-remote-ssh.sh deleted file mode 100755 index 77c51eb2d..000000000 --- a/scripts/bootstrap/verify-remote-ssh.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env bash -# verify-remote-ssh.sh — wrapper that scp's lib.sh + verify-remote.sh -# to the R720 then runs verify-remote.sh there. Saves the operator -# from having to clone the repo on the R720. -# -# Reads R720_HOST + R720_USER from .env or environment. - -set -Eeuo pipefail -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -. "$SCRIPT_DIR/lib.sh" -trap_errors - -[[ -f "$SCRIPT_DIR/.env" ]] && . 
"$SCRIPT_DIR/.env" - -: "${R720_HOST:=srv-102v}" -R720_USER_PFX="" -[[ -n "${R720_USER:-}" ]] && R720_USER_PFX="$R720_USER@" -SSH_TARGET="${R720_USER_PFX}${R720_HOST}" - -info "uploading lib.sh + verify-remote.sh to $SSH_TARGET:/tmp/" -scp -q "$SCRIPT_DIR/lib.sh" "$SCRIPT_DIR/verify-remote.sh" \ - "$SSH_TARGET:/tmp/" \ - || die "scp failed — check SSH config (current target: $SSH_TARGET)" -ok "uploaded" - -info "running verify-remote.sh as root" -# `sudo bash` so the state file at /var/lib/talas/bootstrap.state is -# accessible. If your account has incus group access without sudo, -# drop the `sudo`. -ssh -t "$SSH_TARGET" "sudo bash /tmp/verify-remote.sh" \ - || warn "verify-remote.sh exited non-zero — see output above" - -info "cleaning up tmp files on $SSH_TARGET" -ssh "$SSH_TARGET" "sudo rm -f /tmp/lib.sh /tmp/verify-remote.sh" || true - -ok "done" diff --git a/scripts/bootstrap/verify-remote.sh b/scripts/bootstrap/verify-remote.sh deleted file mode 100755 index 8910b3157..000000000 --- a/scripts/bootstrap/verify-remote.sh +++ /dev/null @@ -1,122 +0,0 @@ -#!/usr/bin/env bash -# verify-remote.sh — read-only checks of R720 state (Incus profiles, -# runner labels, container reachability, certs). Run on the R720 itself -# (locally or via `ssh r720 verify-remote.sh`). -# -# Exit 0 if everything passes ; non-zero with a count of failures. - -set -uo pipefail -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -# shellcheck source=lib.sh -. 
"$SCRIPT_DIR/lib.sh" - -: "${FORGEJO_API_URL:=https://forgejo.talas.group}" - -declare -i PASS=0 FAIL=0 - -check() { - local name=$1 cmd=$2 - if eval "$cmd" >/dev/null 2>&1; then - ok "$name" - PASS+=1 - else - err "$name" - FAIL+=1 - fi -} - -check_with_hint() { - local name=$1 cmd=$2 hint=$3 - if eval "$cmd" >/dev/null 2>&1; then - ok "$name" - PASS+=1 - else - err "$name" - printf >&2 ' %shint:%s %s\n' "$_YELLOW" "$_RESET" "$hint" - FAIL+=1 - fi -} - -section "R720 prerequisites" -check "incus available" "command -v incus" -check "zfs available" "command -v zfs" -check "incus list works" "incus list" - -section "Incus profiles" -for p in veza-app veza-data veza-net; do - check_with_hint "profile $p exists" \ - "incus profile show $p" \ - "run scripts/bootstrap/bootstrap-remote.sh as root" -done - -section "Forgejo container" -check "container 'forgejo' exists" "incus info forgejo" -check "container 'forgejo' RUNNING" \ - "incus list forgejo -f csv -c s 2>/dev/null | grep -q RUNNING" -check_with_hint "Forgejo HTTP responds on :3000" \ - "curl -ksSf -o /dev/null --max-time 5 http://10.0.20.105:3000/ || curl -ksSf -o /dev/null --max-time 5 https://10.0.20.105:3000/" \ - "incus exec forgejo -- systemctl status forgejo" - -section "Forgejo runner" -check "container 'forgejo-runner' exists" "incus info forgejo-runner" -check "container 'forgejo-runner' RUNNING" \ - "incus list forgejo-runner -f csv -c s 2>/dev/null | grep -q RUNNING" -check_with_hint "incus-socket device attached" \ - "incus config device show forgejo-runner | grep -q '^incus-socket:'" \ - "PHASE=2 sudo bash scripts/bootstrap/bootstrap-remote.sh" -check_with_hint "security.nesting=true" \ - "[[ \$(incus config get forgejo-runner security.nesting) == true ]]" \ - "incus config set forgejo-runner security.nesting=true && incus restart forgejo-runner" -check_with_hint "incus-client installed in runner" \ - "incus exec forgejo-runner -- command -v incus" \ - "incus exec forgejo-runner -- apt install -y 
incus-client" -check_with_hint "runner can incus list (socket reachable)" \ - "incus exec forgejo-runner -- incus list" \ - "verify the unix-socket disk device + nesting" -check_with_hint "runner config has 'incus' label" \ - "incus exec forgejo-runner -- bash -c 'for f in /etc/forgejo-runner/.runner /var/lib/forgejo-runner/.runner /opt/forgejo-runner/.runner ; do [[ -f \$f ]] && grep -q incus \$f && exit 0 ; done ; exit 1'" \ - "PHASE=3 sudo bash scripts/bootstrap/bootstrap-remote.sh" -check_with_hint "runner systemd unit active" \ - "incus exec forgejo-runner -- bash -c 'systemctl is-active forgejo-runner.service 2>/dev/null || systemctl is-active act_runner.service'" \ - "incus exec forgejo-runner -- journalctl -u forgejo-runner -n 50" - -section "Edge HAProxy (only after running playbooks/haproxy.yml)" -if incus info veza-haproxy >/dev/null 2>&1; then - check "container 'veza-haproxy' RUNNING" \ - "incus list veza-haproxy -f csv -c s | grep -q RUNNING" - check_with_hint "haproxy systemd unit active" \ - "incus exec veza-haproxy -- systemctl is-active haproxy" \ - "incus exec veza-haproxy -- journalctl -u haproxy -n 50" - check_with_hint "haproxy.cfg present" \ - "incus exec veza-haproxy -- test -f /etc/haproxy/haproxy.cfg" \ - "ansible-playbook -i inventory/staging.yml playbooks/haproxy.yml" - check_with_hint "haproxy.cfg passes self-validation" \ - "incus exec veza-haproxy -- haproxy -f /etc/haproxy/haproxy.cfg -c -q" \ - "config syntax error — re-run ansible-playbook to re-render" - check_with_hint "Let's Encrypt cert dir has at least 1 .pem" \ - "incus exec veza-haproxy -- bash -c 'ls /usr/local/etc/tls/haproxy/*.pem 2>/dev/null | wc -l | grep -q -E \"^[1-9]\"'" \ - "rerun ansible-playbook ; verify port 80 reachable from Internet for HTTP-01" -else - warn "container 'veza-haproxy' does not exist yet — run ansible-playbook playbooks/haproxy.yml" -fi - -section "ZFS state (snapshots tolerated)" -check "rpool exists" \ - "zpool list rpool" - -section "State 
file" -if [[ -f "$TALAS_STATE_FILE" ]]; then - info "phases recorded :" - cat "$TALAS_STATE_FILE" | sed 's/^/ /' -else - warn "no state file at $TALAS_STATE_FILE — bootstrap-remote.sh hasn't run yet" -fi - -section "Result" -if (( FAIL == 0 )); then - ok "$PASS / $((PASS + FAIL)) checks passed" - exit 0 -else - err "$FAIL FAIL out of $((PASS + FAIL)) ($PASS passed)" - exit 1 -fi