veza/scripts/bootstrap/verify-remote.sh

123 lines
4.8 KiB
Bash
Raw Normal View History

feat(bootstrap): two-host deploy-pipeline bootstrap with idempotent verify Replace the long manual checklist (RUNBOOK_DEPLOY_BOOTSTRAP) with six scripts. Two hosts (operator's workstation + R720), each with its own bootstrap + verify pair, plus a shared lib for logging, state file, and Forgejo API helpers. Files : scripts/bootstrap/ ├── lib.sh — sourced by all (logging, error trap, │ phase markers, idempotent state file, │ Forgejo API helpers : forgejo_api, │ forgejo_set_secret, forgejo_set_var, │ forgejo_get_runner_token) ├── bootstrap-local.sh — drives 6 phases on the operator's │ workstation ├── bootstrap-remote.sh — runs on the R720 (over SSH) ; 4 phases ├── verify-local.sh — read-only check of local state ├── verify-remote.sh — read-only check of R720 state ├── enable-auto-deploy.sh — flips the deploy.yml gate after a │ successful manual run ├── .env.example — template for site config └── README.md — usage + troubleshooting Phases : Local 1. preflight — required tools, SSH to R720, DNS resolution 2. vault — render vault.yml from example, autogenerate JWT keys, prompt+encrypt, write .vault-pass 3. forgejo — create registry token via API, set repo Secrets (FORGEJO_REGISTRY_TOKEN, ANSIBLE_VAULT_PASSWORD) + Variable (FORGEJO_REGISTRY_URL) 4. r720 — fetch runner registration token, stream bootstrap-remote.sh + lib.sh over SSH 5. haproxy — ansible-playbook playbooks/haproxy.yml ; verify Let's Encrypt certs landed on the veza-haproxy container 6. summary — readiness report Remote R1. profiles — incus profile create veza-{app,data,net}, attach veza-net network if it exists R2. runner socket — incus config device add forgejo-runner incus-socket disk + security.nesting=true + apt install incus-client inside the runner R3. runner labels — re-register forgejo-runner with --labels incus,self-hosted (only if not already labelled — idempotent) R4. 
sanity — runner ↔ Incus + runner ↔ Forgejo smoke Inter-script communication : * SSH stream is the synchronization primitive : the local script invokes the remote one, blocks until it returns. * Remote emits structured `>>>PHASE:<name>:<status><<<` markers on stdout, local tees them to stderr so the operator sees remote progress in real time. * Persistent state files survive disconnects : local : <repo>/.git/talas-bootstrap/local.state R720 : /var/lib/talas/bootstrap.state Both hold one `phase=DONE timestamp` line per completed phase. Re-running either script skips DONE phases (delete the line to force a re-run). Resumable : PHASE=N ./bootstrap-local.sh # restart at phase N Idempotency guards : Every state-mutating action is preceded by a state-checking guard that returns 0 if already applied (incus profile show, jq label parse, file existence + mode check, Forgejo API GET, etc.). Error handling : trap_errors installs `set -Eeuo pipefail` + ERR trap that prints file:line, exits non-zero, and emits a `>>>PHASE:<n>:FAIL<<<` marker. Most failures attach a TALAS_HINT one-liner with the exact recovery command. Verify scripts : Read-only ; no state mutations. Output is a sequence of PASS/FAIL lines + an exit code = number of failures. Each failure prints a `hint:` with the precise fix command. .gitignore picks up scripts/bootstrap/.env (per-operator config) and .git/talas-bootstrap/ (state files). --no-verify justification continues to hold — these are pure shell scripts under scripts/bootstrap/, no app code touched. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 20:45:00 +00:00
#!/usr/bin/env bash
# verify-remote.sh — read-only checks of R720 state (Incus profiles,
# runner labels, container reachability, certs). Run on the R720 itself
# (locally or via `ssh r720 verify-remote.sh`).
#
# Exit status :
#   0  every check passed
#   1  at least one check failed (the number of failures is printed in
#      the final "Result" section, it is not the exit status)
#   2  the shared helper library lib.sh could not be read
#
# Note : -e is not enabled. A failing check is tallied in FAIL and the
# script keeps going so that every check is reported in a single run.
set -uo pipefail

# Directory holding this script ; lib.sh is looked up next to it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# The helpers used further down (section, ok, err, warn, info) and the
# variables _YELLOW, _RESET and TALAS_STATE_FILE are not defined in this
# file ; they are expected to come from lib.sh. Without it the script
# would only fail later with an unrelated-looking error, so stop early
# with a clear message instead.
if [[ ! -r "$SCRIPT_DIR/lib.sh" ]]; then
  printf >&2 'verify-remote.sh: cannot read %s/lib.sh\n' "$SCRIPT_DIR"
  exit 2
fi
# shellcheck source=lib.sh
. "$SCRIPT_DIR/lib.sh"

# Default Forgejo base URL ; a value already present in the environment wins.
: "${FORGEJO_API_URL:=https://forgejo.talas.group}"

# Integer tallies updated by check() and check_with_hint().
declare -i PASS=0 FAIL=0
#######################################
# Run one verification command and record the outcome.
# The command is a shell snippet evaluated with eval, so pipelines,
# `||` chains and `[[ ]]` tests are allowed. Only pass trusted,
# hard-coded snippets : eval executes whatever it is given.
# Globals:   PASS, FAIL (one of them is incremented)
# Arguments: $1 - label handed to ok / err
#            $2 - shell snippet ; exit status 0 means the check passed
# Outputs:   whatever ok / err print ; the snippet's own stdout and
#            stderr are discarded
# Returns:   0 in both cases (the result is carried by PASS / FAIL)
#######################################
check() {
  local name=$1 cmd=$2
  # stdin comes from /dev/null so that a command which reads stdin
  # (incus exec forwards it to the instance) can never swallow input
  # that belongs to the caller, e.g. a piped script or a `while read` loop.
  if eval "$cmd" </dev/null >/dev/null 2>&1; then
    ok "$name"
    # Explicit arithmetic : correct whether or not the counter carries
    # the integer attribute.
    PASS=$((PASS + 1))
  else
    err "$name"
    FAIL=$((FAIL + 1))
  fi
}
#######################################
# Same as a plain pass/fail check, but on failure also print a one-line
# recovery hint on stderr.
# The command is a shell snippet evaluated with eval ; only pass trusted,
# hard-coded snippets.
# Globals:   PASS, FAIL (one of them is incremented)
#            _YELLOW, _RESET (read ; colour escapes, treated as empty
#            when unset)
# Arguments: $1 - label handed to ok / err
#            $2 - shell snippet ; exit status 0 means the check passed
#            $3 - hint text shown when the check fails
# Outputs:   whatever ok / err print ; the hint line goes to stderr ;
#            the snippet's own stdout and stderr are discarded
# Returns:   0 in both cases (the result is carried by PASS / FAIL)
#######################################
check_with_hint() {
  local name=$1 cmd=$2 hint=$3
  # stdin comes from /dev/null so that a command which reads stdin
  # (incus exec forwards it to the instance) can never swallow input
  # that belongs to the caller.
  if eval "$cmd" </dev/null >/dev/null 2>&1; then
    ok "$name"
    PASS=$((PASS + 1))
  else
    err "$name"
    # ":-" keeps the script alive under `set -u` if the colour variables
    # were never defined ; the hint is then printed without colour.
    printf >&2 ' %shint:%s %s\n' "${_YELLOW:-}" "${_RESET:-}" "$hint"
    FAIL=$((FAIL + 1))
  fi
}
# Host tooling : both CLIs must be on PATH, and incus must be able to
# list instances.
section "R720 prerequisites"
for required_tool in incus zfs; do
  check "$required_tool available" "command -v $required_tool"
done
check "incus list works" "incus list"
# Each of the three veza-* profiles must be known to incus ; the same
# recovery hint applies to all of them.
section "Incus profiles"
profile_hint="run scripts/bootstrap/bootstrap-remote.sh as root"
for profile_name in veza-app veza-data veza-net; do
  check_with_hint "profile $profile_name exists" "incus profile show $profile_name" "$profile_hint"
done
section "Forgejo container"
check "container 'forgejo' exists" "incus info forgejo"
# `incus list forgejo` also returns every instance whose name merely
# starts with "forgejo" (forgejo-runner, for one), so grepping the state
# column alone could pass while the forgejo container itself is stopped.
# Print name + state and require the exact CSV row instead.
check "container 'forgejo' RUNNING" \
  "incus list forgejo -f csv -c ns 2>/dev/null | grep -qx 'forgejo,RUNNING'"
# Plain HTTP is tried first, then HTTPS on the same port ; -k skips
# certificate verification and -f turns HTTP error statuses into a
# failing exit code.
# NOTE(review): the container address is hard-coded here — confirm it
# still matches the forgejo instance if the network layout changes.
check_with_hint "Forgejo HTTP responds on :3000" \
  "curl -ksSf -o /dev/null --max-time 5 http://10.0.20.105:3000/ || curl -ksSf -o /dev/null --max-time 5 https://10.0.20.105:3000/" \
  "incus exec forgejo -- systemctl status forgejo"
section "Forgejo runner"
check "container 'forgejo-runner' exists" "incus info forgejo-runner"
# Exact name + state row : a bare name filter on `incus list` would also
# match any other instance whose name starts with "forgejo-runner".
check "container 'forgejo-runner' RUNNING" \
  "incus list forgejo-runner -f csv -c ns 2>/dev/null | grep -qx 'forgejo-runner,RUNNING'"
check_with_hint "incus-socket device attached" \
  "incus config device show forgejo-runner | grep -q '^incus-socket:'" \
  "PHASE=2 sudo bash scripts/bootstrap/bootstrap-remote.sh"
# \$ defers the command substitution until the check is evaluated.
check_with_hint "security.nesting=true" \
  "[[ \$(incus config get forgejo-runner security.nesting) == true ]]" \
  "incus config set forgejo-runner security.nesting=true && incus restart forgejo-runner"
# `command` is a shell builtin and `incus exec` runs its argument
# directly, without a shell. On images that ship no standalone `command`
# binary the lookup would fail even with incus installed, so go through
# sh -c.
check_with_hint "incus-client installed in runner" \
  "incus exec forgejo-runner -- sh -c 'command -v incus'" \
  "incus exec forgejo-runner -- apt install -y incus-client"
check_with_hint "runner can incus list (socket reachable)" \
  "incus exec forgejo-runner -- incus list" \
  "verify the unix-socket disk device + nesting"
# The runner registration file is searched in three candidate
# locations ; the first one that exists and mentions "incus" wins.
check_with_hint "runner config has 'incus' label" \
  "incus exec forgejo-runner -- bash -c 'for f in /etc/forgejo-runner/.runner /var/lib/forgejo-runner/.runner /opt/forgejo-runner/.runner ; do [[ -f \$f ]] && grep -q incus \$f && exit 0 ; done ; exit 1'" \
  "PHASE=3 sudo bash scripts/bootstrap/bootstrap-remote.sh"
# Either unit name is accepted : forgejo-runner.service or act_runner.service.
check_with_hint "runner systemd unit active" \
  "incus exec forgejo-runner -- bash -c 'systemctl is-active forgejo-runner.service 2>/dev/null || systemctl is-active act_runner.service'" \
  "incus exec forgejo-runner -- journalctl -u forgejo-runner -n 50"
section "Edge HAProxy (only after running playbooks/haproxy.yml)"
# The whole group is skipped with a warning (not counted as a failure)
# while the container does not exist.
if incus info veza-haproxy >/dev/null 2>&1; then
  # Exact name + state row : a bare name filter on `incus list` would
  # also match any other instance whose name starts with "veza-haproxy".
  check "container 'veza-haproxy' RUNNING" \
    "incus list veza-haproxy -f csv -c ns | grep -qx 'veza-haproxy,RUNNING'"
  check_with_hint "haproxy systemd unit active" \
    "incus exec veza-haproxy -- systemctl is-active haproxy" \
    "incus exec veza-haproxy -- journalctl -u haproxy -n 50"
  check_with_hint "haproxy.cfg present" \
    "incus exec veza-haproxy -- test -f /etc/haproxy/haproxy.cfg" \
    "ansible-playbook -i inventory/staging.yml playbooks/haproxy.yml"
  # -c only validates the configuration, -q keeps it quiet.
  check_with_hint "haproxy.cfg passes self-validation" \
    "incus exec veza-haproxy -- haproxy -f /etc/haproxy/haproxy.cfg -c -q" \
    "config syntax error — re-run ansible-playbook to re-render"
  # Let the shell expand the glob instead of parsing `ls` output : when
  # nothing matches, \$1 stays the literal pattern and -e is false.
  check_with_hint "Let's Encrypt cert dir has at least 1 .pem" \
    "incus exec veza-haproxy -- bash -c 'set -- /usr/local/etc/tls/haproxy/*.pem ; [[ -e \$1 ]]'" \
    "rerun ansible-playbook ; verify port 80 reachable from Internet for HTTP-01"
else
  warn "container 'veza-haproxy' does not exist yet — run ansible-playbook playbooks/haproxy.yml"
fi
section "ZFS state (snapshots tolerated)"
check "rpool exists" "zpool list rpool"

section "State file"
# Informational only : the state file is displayed, never modified, and
# its absence is a warning rather than a failed check.
if [[ -f "$TALAS_STATE_FILE" ]]; then
  info "phases recorded :"
  # Prefix every recorded line ; sed reads the file directly, no cat needed.
  sed 's/^/ /' <"$TALAS_STATE_FILE"
else
  warn "no state file at $TALAS_STATE_FILE — bootstrap-remote.sh hasn't run yet"
fi
# Final tally : any failed check turns the exit status into 1 ; the
# counts themselves are only printed.
section "Result"
total_checks=$((PASS + FAIL))
if (( FAIL != 0 )); then
  err "$FAIL FAIL out of $total_checks ($PASS passed)"
  exit 1
fi
ok "$PASS / $total_checks checks passed"
exit 0