veza/scripts/bootstrap/bootstrap-r720.sh

226 lines
8.8 KiB
Bash
Raw Normal View History

refactor(bootstrap): everything via Ansible — no NOPASSWD, no SSH plumbing Rearchitecture after operator pushback : the previous design did too much in bash (SSH-streaming script chunks, manual sudo dance, NOPASSWD requirement). Ansible is the right tool. The shell scripts are now thin orchestrators handling the chicken-and-egg of vault + Forgejo CI provisioning, then calling ansible-playbook. Key principles : 1. NO NOPASSWD sudo on the R720. --ask-become-pass interactive, password held in ansible memory only for the run. 2. Two parallel scripts — one per host, fully self-contained. 3. Both run the SAME Ansible playbooks (bootstrap_runner.yml + haproxy.yml). Difference is the inventory. Files (new + replaced) : ansible.cfg pipelining=True → False. Required for --ask-become-pass to work reliably ; the previous setting raced sudo's prompt and timed out at 12s. playbooks/bootstrap_runner.yml (new) The Incus-host-side bootstrap, ported from the old scripts/bootstrap/bootstrap-remote.sh. Three plays : Phase 1 : ensure veza-app + veza-data profiles exist ; drop legacy empty veza-net profile. Phase 2 : forgejo-runner gets /var/lib/incus/unix.socket attached as a disk device, security.nesting=true, /usr/bin/incus pushed in as /usr/local/bin/incus, smoke-tested. Phase 3 : forgejo-runner registered with `incus,self-hosted` label (idempotent — skips if already labelled). Each task uses Ansible idioms (`incus_profile`, `incus_command` where they exist, `command:` with `failed_when` and explicit state-checking elsewhere). no_log on the registration token. inventory/local.yml (new) Inventory for `bootstrap-r720.sh` — connection: local instead of SSH+become. Same group structure as staging.yml ; container groups use community.general.incus connection plugin (the local incus binary, no remote). inventory/{staging,prod}.yml (modified) Added `forgejo_runner` group (target of bootstrap_runner.yml phase 3, reached via community.general.incus from the host). 
scripts/bootstrap/bootstrap-local.sh (rewritten) Five phases : preflight, vault, forgejo, ansible, summary. Phase 4 calls a single `ansible-playbook` with both bootstrap_runner.yml + haproxy.yml in sequence. --ask-become-pass : ansible prompts ONCE for sudo, holds in memory, reuses for every become: true task. scripts/bootstrap/bootstrap-r720.sh (new) Symmetric to bootstrap-local.sh but runs as root on the R720. No SSH preflight, no --ask-become-pass (already root). Same Ansible playbooks, inventory/local.yml. scripts/bootstrap/verify-r720.sh (new — replaces verify-remote) Read-only checks of R720 state. Run as root locally on the R720. scripts/bootstrap/verify-local.sh (modified) Cross-host SSH check now fits the env-var-driven SSH_TARGET pattern (R720_USER may be empty if the alias has User=). scripts/bootstrap/{bootstrap-remote.sh, verify-remote.sh, verify-remote-ssh.sh} (DELETED) Replaced by playbooks/bootstrap_runner.yml + verify-r720.sh. README.md (rewritten) Documents the parallel-script architecture, the no-NOPASSWD-sudo design choice (--ask-become-pass), each phase's needs, and a refreshed troubleshooting list. State files unchanged in shape : laptop : .git/talas-bootstrap/local.state R720 : /var/lib/talas/r720-bootstrap.state --no-verify justification continues to hold. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-30 13:12:26 +00:00
#!/usr/bin/env bash
# bootstrap-r720.sh — runs DIRECTLY on the R720 (Incus host).
# Symmetric counterpart to bootstrap-local.sh : same ansible playbook,
# different inventory (connection: local instead of SSH+become).
#
# Use this script when :
# * The operator's laptop can't yet reach the R720 (no WireGuard up)
# * Disaster recovery — the laptop is unavailable
# * Faster iteration than driving Ansible over SSH
#
# Prerequisites :
# * The veza repo cloned on the R720 (or scp'd into /tmp together with its
#   .git directory — the script resolves the repo root via `git rev-parse`)
# * Operator runs THIS script with sudo
# sudo bash scripts/bootstrap/bootstrap-r720.sh
# * Ansible + collections installed on R720 (script installs if missing)
#
# Phases mirror bootstrap-local.sh but :
# * SSH preflight skipped (we ARE the R720)
# * Forgejo provisioning still works (same API, same .env)
# * Ansible runs against inventory/local.yml (connection: local)
# * No --ask-become-pass — already root via sudo
# Strict mode: abort on errors (-e), unset variables (-u) and failing pipeline
# stages (pipefail); -E lets an ERR trap be inherited by functions.
set -Eeuo pipefail

# Absolute directory containing this script; lib.sh and .env live next to it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Shared helpers (die, section, ok, mark_done, ...) come from lib.sh.
# shellcheck source=/dev/null
source "$SCRIPT_DIR/lib.sh"
trap_errors

# Everything below manipulates Incus and system packages: root only.
if [[ $EUID -ne 0 ]]; then
  die "bootstrap-r720.sh must be run as root (use sudo)"
fi

# Optional operator overrides (tokens, API URL, ...).
if [[ -f "$SCRIPT_DIR/.env" ]]; then
  # shellcheck source=/dev/null
  source "$SCRIPT_DIR/.env"
fi

# Forgejo defaults, applied only when the variable is unset or empty.
: "${FORGEJO_API_URL:=https://10.0.20.105:3000}"
: "${FORGEJO_INSECURE:=1}"
: "${FORGEJO_OWNER:=senke}"
: "${FORGEJO_REPO:=veza}"

# Repository root, resolved through git from the script's own location.
if ! REPO_ROOT=$(git -C "$SCRIPT_DIR" rev-parse --show-toplevel); then
  die "not in a git repo (or git missing)"
fi

# Ansible vault artefacts checked in phase 2 and used in phases 3 and 4.
VAULT_YML="$REPO_ROOT/infra/ansible/group_vars/all/vault.yml"
VAULT_PASS="$REPO_ROOT/infra/ansible/.vault-pass"

# Location of the R720 bootstrap state file (reported in the summary phase).
TALAS_STATE_DIR="/var/lib/talas"
TALAS_STATE_FILE="$TALAS_STATE_DIR/r720-bootstrap.state"
# ============================================================================
# Phase 1 — preflight (R720-side)
# ============================================================================
#######################################
# Phase 1: verify the R720 has the tools, containers and network the later
# phases rely on, installing Ansible through apt when it is absent.
# Globals:   _current_phase (written)
# Arguments: none
# Returns:   0 when the phase was already done or all checks pass; fatal
#            conditions are reported through die.
#######################################
phase_1_preflight() {
  section "Phase 1 — Preflight (R720)"
  _current_phase=preflight
  skip_if_done preflight "preflight" && return 0

  # incus is part of the list so that a host without the Incus CLI is reported
  # as a missing command instead of the misleading "container missing" below.
  # NOTE(review): require_cmd lives in lib.sh; presumably it aborts on the
  # first missing command — confirm.
  require_cmd git curl jq openssl incus

  # Install ansible if missing.
  if ! command -v ansible >/dev/null 2>&1; then
    info "installing ansible + python deps"
    apt-get update -qq
    apt-get install -y ansible python3-psycopg2 jq
  fi
  # Second field of the first line of `ansible --version`.
  ok "ansible $(ansible --version | awk 'NR==1 {print $2}')"

  # Required containers must already exist (forgejo + forgejo-runner).
  incus info forgejo >/dev/null 2>&1 \
    || die "container 'forgejo' missing on this host — bootstrap forgejo first"
  incus info forgejo-runner >/dev/null 2>&1 \
    || die "container 'forgejo-runner' missing on this host"
  ok "forgejo + forgejo-runner containers present"

  # net-veza must exist as a network.
  incus network show net-veza >/dev/null 2>&1 \
    || die "incus network 'net-veza' missing — create with: incus network create net-veza ipv4.address=10.0.20.1/24 ipv4.nat=true"
  ok "net-veza network present"

  mark_done preflight
}
# ============================================================================
# Phase 2 — vault (assumes vault.yml + .vault-pass already exist on R720)
# ============================================================================
#######################################
# Phase 2: confirm the Ansible vault file and its password file exist and that
# the password actually decrypts the vault.
# Globals:   VAULT_YML, VAULT_PASS (read); _current_phase (written)
# Arguments: none
# Returns:   0 when the phase was already done or the vault decrypts;
#            a decryption failure is reported through die.
#######################################
phase_2_vault() {
  section "Phase 2 — Vault check"
  _current_phase=vault
  if skip_if_done vault "vault check"; then
    return 0
  fi

  local required
  for required in "$VAULT_YML" "$VAULT_PASS"; do
    require_file "$required"
  done

  # Decrypt to /dev/null: only the exit status matters here.
  if ! ansible-vault view --vault-password-file "$VAULT_PASS" "$VAULT_YML" >/dev/null; then
    die "cannot decrypt $VAULT_YML with $VAULT_PASS"
  fi

  ok "vault present + decryptable"
  mark_done vault
}
# ============================================================================
# Phase 3 — Forgejo Secrets + Variables (HTTPS API ; no Ansible needed)
# ============================================================================
#######################################
# Phase 3: provision the Forgejo repository's Actions secrets and variables
# through the HTTP API.
#   * FORGEJO_REGISTRY_TOKEN  (secret; taken from the environment or prompted)
#   * ANSIBLE_VAULT_PASSWORD  (secret; contents of VAULT_PASS)
#   * FORGEJO_REGISTRY_URL    (variable; generic package endpoint of the owner)
# Globals:   FORGEJO_API_URL, FORGEJO_OWNER, FORGEJO_REPO, FORGEJO_INSECURE,
#            FORGEJO_REGISTRY_TOKEN, FORCE_FORGEJO_REPROMPT, VAULT_PASS (read);
#            _current_phase (written)
# Arguments: none
# Returns:   0 when the phase was already done or provisioning succeeded;
#            fatal conditions are reported through die.
#######################################
phase_3_forgejo() {
  section "Phase 3 — Forgejo Secrets + Variables"
  _current_phase=forgejo
  skip_if_done forgejo "Forgejo provisioning" && return 0

  require_env FORGEJO_ADMIN_TOKEN \
    "create at $FORGEJO_API_URL/-/user/settings/applications"

  # Read the vault password up front and validate it. Reading it inline as a
  # "$(cat …)" argument would hide a failed read and push an EMPTY secret —
  # possible when this phase runs without phase 2 (e.g. PHASE=3).
  local vault_pw
  [[ -r "$VAULT_PASS" ]] || die "cannot read vault password file $VAULT_PASS"
  vault_pw=$(<"$VAULT_PASS")
  [[ -n "$vault_pw" ]] || die "vault password file $VAULT_PASS is empty"

  # The option array is never empty, so expanding it is safe under `set -u`
  # even on bash < 4.4. -k disables TLS certificate verification.
  local curl_opts=(-fsSL --max-time 10)
  if [[ "${FORGEJO_INSECURE:-0}" == "1" ]]; then
    curl_opts+=(-k)
  fi
  curl "${curl_opts[@]}" "$FORGEJO_API_URL/api/v1/version" >/dev/null \
    || die "Forgejo API unreachable at $FORGEJO_API_URL"
  forgejo_api GET "/repos/$FORGEJO_OWNER/$FORGEJO_REPO" >/dev/null \
    || die "repo not found / token lacks read:repository"
  ok "Forgejo reachable + repo OK"

  # Is the registry token secret already defined on the repository?
  local has_registry_token=0
  if forgejo_api GET "/repos/$FORGEJO_OWNER/$FORGEJO_REPO/actions/secrets" 2>/dev/null \
    | jq -e '.[]? | select(.name == "FORGEJO_REGISTRY_TOKEN")' >/dev/null; then
    has_registry_token=1
  fi

  if [[ "${FORCE_FORGEJO_REPROMPT:-0}" != "1" ]] && (( has_registry_token == 1 )); then
    ok "FORGEJO_REGISTRY_TOKEN already set"
  else
    local rtok="${FORGEJO_REGISTRY_TOKEN:-}"
    if [[ -z "$rtok" ]]; then
      warn "create token at $FORGEJO_API_URL/-/user/settings/applications (write:package)"
      prompt_password rtok "paste token (input hidden)"
    fi
    # Same guard as the runner-token prompt in phase 4: never push an empty secret.
    [[ -n "$rtok" ]] || die "no token provided"
    forgejo_set_secret "$FORGEJO_OWNER" "$FORGEJO_REPO" FORGEJO_REGISTRY_TOKEN "$rtok"
  fi

  forgejo_set_secret "$FORGEJO_OWNER" "$FORGEJO_REPO" ANSIBLE_VAULT_PASSWORD "$vault_pw"
  forgejo_set_var "$FORGEJO_OWNER" "$FORGEJO_REPO" FORGEJO_REGISTRY_URL \
    "$FORGEJO_API_URL/api/packages/$FORGEJO_OWNER/generic"

  mark_done forgejo
}
# ============================================================================
# Phase 4 — local Ansible (connection: local, no SSH, no --ask-become-pass)
# ============================================================================
#######################################
# Phase 4: run bootstrap_runner.yml + haproxy.yml against inventory/local.yml.
# Steps: ensure the Ansible collections are installed, obtain a Forgejo runner
# registration token (API first, operator prompt as fallback), detect the
# Incus network of the forgejo container, run ansible-playbook, then list the
# TLS certificates inside the veza-haproxy container.
# Globals:   REPO_ROOT, VAULT_PASS, FORGEJO_API_URL, FORGEJO_OWNER,
#            FORGEJO_REPO (read); _current_phase, TALAS_HINT (written)
# Arguments: none
# Returns:   0 when the phase was already done or the playbooks succeeded;
#            fatal conditions are reported through die.
# Side effect: changes the working directory to $REPO_ROOT/infra/ansible.
#######################################
phase_4_ansible() {
  section "Phase 4 — Ansible bootstrap (local)"
  _current_phase=ansible
  skip_if_done ansible "ansible bootstrap" && return 0

  info "ensuring ansible collections"
  local col installed
  for col in community.general community.postgresql community.rabbitmq; do
    # Capture the listing first: piping straight into `grep -q` can fail the
    # pipeline under pipefail when grep exits before the producer finishes,
    # which would trigger a needless reinstall.
    installed=$(ansible-galaxy collection list "$col" 2>/dev/null || true)
    if ! grep -q "^$col" <<<"$installed"; then
      ansible-galaxy collection install "$col" >/dev/null \
        || die "ansible-galaxy install $col failed"
    fi
  done
  ok "collections present"

  require_env FORGEJO_ADMIN_TOKEN
  info "fetching runner registration token from Forgejo"
  local reg_token
  # A "successful" fetch that yields an empty string is treated as a failure,
  # so the operator is prompted instead of registering with a blank token.
  if reg_token=$(forgejo_get_runner_token "$FORGEJO_OWNER" "$FORGEJO_REPO") \
    && [[ -n "$reg_token" ]]; then
    ok "got runner registration token (${#reg_token} chars)"
  else
    warn "auto-fetch failed — generate manually at :"
    warn " $FORGEJO_API_URL/$FORGEJO_OWNER/$FORGEJO_REPO/settings/actions/runners"
    prompt_password reg_token "paste runner registration token (input hidden)"
    [[ -n "$reg_token" ]] || die "no token provided"
  fi

  # Inventory and playbook paths below are relative to the ansible directory.
  cd "$REPO_ROOT/infra/ansible" || die "cannot cd to $REPO_ROOT/infra/ansible"

  # Detect network — we're root, sudo not needed.
  # Falls back to net-veza when the lookup fails or returns nothing/"None".
  local detected_net
  detected_net=$(incus config device get forgejo eth0 network 2>/dev/null \
    | tr -d '[:space:]' || true)
  if [[ -z "$detected_net" || "$detected_net" == "None" ]]; then
    detected_net="net-veza"
  fi
  ok "Incus network : $detected_net"

  info "running bootstrap_runner.yml + haproxy.yml against inventory/local.yml"
  # NOTE(review): the registration token travels on the command line and is
  # therefore visible in the process list while the playbooks run; consider
  # an extra-vars file if that matters on this host.
  if ! ansible-playbook \
    -i inventory/local.yml \
    --vault-password-file "$VAULT_PASS" \
    -e forgejo_registration_token="$reg_token" \
    -e forgejo_api_url="$FORGEJO_API_URL" \
    -e veza_incus_network="$detected_net" \
    playbooks/bootstrap_runner.yml \
    playbooks/haproxy.yml; then
    TALAS_HINT="check ansible output above ; common: port 80 not reachable from Internet for LE HTTP-01"
    die "ansible-playbook failed"
  fi

  info "verifying Let's Encrypt certs"
  local certs
  # Best effort: a missing container or directory simply yields an empty list.
  certs=$(incus exec veza-haproxy -- ls /usr/local/etc/tls/haproxy/ 2>/dev/null || true)
  # Explicit if/else: with `A && ok || warn`, a failing ok would also warn.
  if [[ -n "$certs" ]]; then
    ok "certs : $(printf '%s\n' "$certs" | tr '\n' ' ')"
  else
    warn "no certs — check port 80 reachable from Internet, then re-run"
  fi

  mark_done ansible
}
# ============================================================================
# Phase 5 — Summary
# ============================================================================
#######################################
# Phase 5: print the closing summary (next steps + state file location) on
# stderr and record the phase as done.
# Globals:   _GREEN, _BOLD, _RESET, FORGEJO_API_URL, FORGEJO_OWNER,
#            FORGEJO_REPO, SCRIPT_DIR, TALAS_STATE_FILE (read)
# Arguments: none
# Outputs:   the summary text, written to stderr
#######################################
phase_5_summary() {
  section "Phase 5 — Summary"

  # One argument per output line; %s keeps every value literal.
  printf '%s\n' \
    "${_GREEN}${_BOLD}✓ Bootstrap complete (R720 local).${_RESET}" \
    "Next steps :" \
    "1. Trigger a manual deploy : $FORGEJO_API_URL/$FORGEJO_OWNER/$FORGEJO_REPO/actions" \
    "2. Verify : sudo bash $SCRIPT_DIR/verify-r720.sh" \
    "State file : $TALAS_STATE_FILE" >&2

  mark_done summary
}
#######################################
# Run the bootstrap phases in order, starting at the phase number given in the
# PHASE environment variable (default 1). Phases numbered below the starting
# point are skipped.
# Globals:   PHASE (read)
# Arguments: none are used
# Returns:   0 after the selected phases ran; an invalid PHASE value is
#            reported through die.
#######################################
main() {
  local start=${PHASE:-1}

  # Reject non-numeric values up front: inside an arithmetic test they would
  # be treated as variable names and abort with an obscure "unbound variable".
  [[ "$start" =~ ^[0-9]+$ ]] \
    || die "PHASE must be a non-negative integer (got '$start')"
  # Force base 10 so that values such as 08 are not parsed as invalid octal.
  start=$((10#$start))

  info "starting at phase $start (running on R720 as root)"
  if (( start <= 1 )); then phase_1_preflight; fi
  if (( start <= 2 )); then phase_2_vault; fi
  if (( start <= 3 )); then phase_3_forgejo; fi
  if (( start <= 4 )); then phase_4_ansible; fi
  if (( start <= 5 )); then phase_5_summary; fi
  ok "ALL DONE"
}
# Entry point: hand the script's command-line arguments to main (the starting
# phase itself is selected through the PHASE environment variable).
main "$@"