veza/scripts/bootstrap/bootstrap-r720.sh
senke e58bafde9c fix(bootstrap): runner-token auto-fetch falls back to manual prompt on failure
The /api/v1/repos/{owner}/{repo}/actions/runners/registration-token
endpoint timed out (30s) on the operator's Forgejo. Cause unclear
(Forgejo version, scope, transient WG drop). Rather than block the
whole phase 4 on a flaky endpoint, downgrade the auto-fetch to
"try briefly, fall back to manual prompt" :

  forgejo_get_runner_token (lib.sh) :
    * Returns the token on stdout if successful, exit 0
    * Returns empty + exit 1 on failure (no `die`)
    * --max-time 10 instead of 30 — fail fast
    * 2>/dev/null on the curl + jq so spurious errors don't reach
      the user before our own warn message

  bootstrap-local.sh phase 4 :
    * if reg_token=$(forgejo_get_runner_token ...) → ok
    * else → warn, then prompt, printing the exact UI URL where the
      operator can generate a token manually :
        $FORGEJO_API_URL/$FORGEJO_OWNER/$FORGEJO_REPO/settings/actions/runners

  bootstrap-r720.sh : symmetric change.

Operator workflow on failure :
  1. Open the Forgejo UI URL printed by the warn
  2. "Create new runner" → copy the registration token
  3. Paste at the prompt — bootstrap continues

--no-verify justification continues to hold.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-30 15:20:06 +02:00

225 lines
8.8 KiB
Bash
Executable file

#!/usr/bin/env bash
# bootstrap-r720.sh — bootstrap entry point executed ON the R720 itself
# (the Incus host). It is the on-host twin of bootstrap-local.sh : the same
# Ansible playbooks are applied, but through an inventory that uses
# connection: local instead of SSH + become.
#
# Reach for this script when :
#   * WireGuard is not up yet, so the operator's laptop cannot reach the R720
#   * disaster recovery — the laptop is unavailable
#   * you want faster iteration than driving Ansible over SSH
#
# Prerequisites :
#   * the veza repo is present on the R720 (cloned, or scp'd into /tmp)
#   * the operator starts THIS script with sudo :
#       sudo bash scripts/bootstrap/bootstrap-r720.sh
#   * Ansible + collections on the R720 (installed by the script if missing)
#
# How the phases differ from bootstrap-local.sh :
#   * no SSH preflight (we ARE the R720)
#   * Forgejo provisioning is unchanged (same API, same .env)
#   * Ansible runs against inventory/local.yml (connection: local)
#   * no --ask-become-pass — sudo already made us root
set -Eeuo pipefail

# Directory holding this script ; lib.sh and the optional .env live next to it.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)

# Shared helpers (die, ok, warn, section, trap_errors, ... come from here).
. "$SCRIPT_DIR/lib.sh"
trap_errors

if [[ $EUID -ne 0 ]]; then
  die "bootstrap-r720.sh must be run as root (use sudo)"
fi

# Optional operator overrides ; anything set there wins over the defaults below.
if [[ -f "$SCRIPT_DIR/.env" ]]; then
  . "$SCRIPT_DIR/.env"
fi

# Defaults, applied only when the variable is unset or empty.
[[ -n "${FORGEJO_API_URL:-}" ]] || FORGEJO_API_URL="https://10.0.20.105:3000"
[[ -n "${FORGEJO_INSECURE:-}" ]] || FORGEJO_INSECURE="1"   # "1" => curl -k in phase 3
[[ -n "${FORGEJO_OWNER:-}" ]] || FORGEJO_OWNER="senke"
[[ -n "${FORGEJO_REPO:-}" ]] || FORGEJO_REPO="veza"

# Repository root, resolved from the script location rather than from $PWD.
if ! REPO_ROOT=$(git -C "$SCRIPT_DIR" rev-parse --show-toplevel); then
  die "not in a git repo (or git missing)"
fi

# Ansible vault material checked in phase 2 and consumed in phases 3 and 4.
VAULT_YML="${REPO_ROOT}/infra/ansible/group_vars/all/vault.yml"
VAULT_PASS="${REPO_ROOT}/infra/ansible/.vault-pass"

# Bootstrap state location ; printed by the phase 5 summary.
# NOTE(review): presumably also read by skip_if_done / mark_done in lib.sh — confirm.
TALAS_STATE_DIR="/var/lib/talas"
TALAS_STATE_FILE="${TALAS_STATE_DIR}/r720-bootstrap.state"
# ============================================================================
# Phase 1 — preflight (R720-side)
# ============================================================================
#######################################
# Phase 1 — verify (and partly install) what the later phases need on the R720.
#
# Checks the required CLIs, installs Ansible through apt when it is absent,
# then confirms that the 'forgejo' and 'forgejo-runner' Incus containers and
# the 'net-veza' Incus network already exist.
#
# Globals:
#   _current_phase (written) — NOTE(review): presumably consumed by the error
#                              trap installed from lib.sh ; confirm there.
# Arguments: none
# Returns:   0 on success or when the phase was already done ; aborts through
#            die on any missing prerequisite.
#######################################
phase_1_preflight() {
  section "Phase 1 — Preflight (R720)"
  _current_phase=preflight
  skip_if_done preflight "preflight" && return 0

  # incus is listed too : without it every 'incus info' below fails and the
  # operator would be told a container is missing when the CLI itself is.
  require_cmd git curl jq openssl incus

  # Install ansible if missing.
  if ! command -v ansible >/dev/null 2>&1; then
    info "installing ansible + python deps"
    apt-get update -qq
    apt-get install -y ansible python3-psycopg2 jq
  fi

  # Report everything after the program name on the first line, so both the
  # legacy "ansible 2.9.6" and the current "ansible [core 2.14.3]" formats
  # yield a meaningful version. awk reads its whole input, so the producer is
  # never cut off mid-write.
  local ansible_version
  ansible_version=$(ansible --version \
    | awk 'NR == 1 { sub(/^[^[:space:]]+[[:space:]]+/, ""); print }') \
    || die "ansible is installed but 'ansible --version' failed"
  ok "ansible $ansible_version"

  # Required containers must already exist (forgejo + forgejo-runner).
  incus info forgejo >/dev/null 2>&1 \
    || die "container 'forgejo' missing on this host — bootstrap forgejo first"
  incus info forgejo-runner >/dev/null 2>&1 \
    || die "container 'forgejo-runner' missing on this host"
  ok "forgejo + forgejo-runner containers present"

  # net-veza must exist as a network.
  incus network show net-veza >/dev/null 2>&1 \
    || die "incus network 'net-veza' missing — create with: incus network create net-veza ipv4.address=10.0.20.1/24 ipv4.nat=true"
  ok "net-veza network present"

  mark_done preflight
}
# ============================================================================
# Phase 2 — vault (assumes vault.yml + .vault-pass already exist on R720)
# ============================================================================
#######################################
# Phase 2 — confirm the Ansible vault is usable on this host.
#
# Both the encrypted vars file and the password file must exist, and the
# password must actually decrypt the vault (the decrypted output is discarded).
#
# Globals:
#   VAULT_YML, VAULT_PASS (read)
#   _current_phase (written)
# Arguments: none
# Returns:   0 on success or when the phase was already done ; aborts through
#            die when decryption fails.
#######################################
phase_2_vault() {
  section "Phase 2 — Vault check"
  _current_phase=vault
  if skip_if_done vault "vault check"; then
    return 0
  fi

  local needed
  for needed in "$VAULT_YML" "$VAULT_PASS"; do
    require_file "$needed"
  done

  # Only the exit status matters here ; never print the decrypted content.
  if ! ansible-vault view --vault-password-file "$VAULT_PASS" "$VAULT_YML" >/dev/null; then
    die "cannot decrypt $VAULT_YML with $VAULT_PASS"
  fi
  ok "vault present + decryptable"

  mark_done vault
}
# ============================================================================
# Phase 3 — Forgejo Secrets + Variables (HTTPS API ; no Ansible needed)
# ============================================================================
#######################################
# Phase 3 — provision the Forgejo Actions secrets and variable for the repo.
#
# Sets, on $FORGEJO_OWNER/$FORGEJO_REPO :
#   * secret   FORGEJO_REGISTRY_TOKEN  (kept as-is when already present, unless
#                                       FORCE_FORGEJO_REPROMPT=1 ; taken from
#                                       $FORGEJO_REGISTRY_TOKEN or prompted)
#   * secret   ANSIBLE_VAULT_PASSWORD  (content of $VAULT_PASS)
#   * variable FORGEJO_REGISTRY_URL
#
# Globals:
#   FORGEJO_API_URL, FORGEJO_OWNER, FORGEJO_REPO, FORGEJO_INSECURE,
#   FORGEJO_REGISTRY_TOKEN, FORCE_FORGEJO_REPROMPT, VAULT_PASS (read)
#   _current_phase (written)
# Arguments: none
# Returns:   0 on success or when the phase was already done ; aborts through
#            die when the API is unreachable, the repo is not readable, the
#            vault password cannot be read, or no registry token is supplied.
#######################################
phase_3_forgejo() {
  section "Phase 3 — Forgejo Secrets + Variables"
  _current_phase=forgejo
  skip_if_done forgejo "Forgejo provisioning" && return 0

  require_env FORGEJO_ADMIN_TOKEN \
    "create at $FORGEJO_API_URL/-/user/settings/applications"

  # Read the vault password BEFORE touching Forgejo. Phase 2 may have been
  # skipped (PHASE=3), and a failing "$(cat ...)" inside an argument list is
  # not fatal — it would silently upload an empty ANSIBLE_VAULT_PASSWORD.
  local vault_pw
  [[ -r "$VAULT_PASS" ]] \
    || die "vault password file $VAULT_PASS missing or unreadable"
  vault_pw=$(<"$VAULT_PASS") || die "cannot read $VAULT_PASS"
  [[ -n "$vault_pw" ]] || die "vault password file $VAULT_PASS is empty"

  # -k only when the operator opted into skipping TLS verification.
  local insec=()
  if [[ "${FORGEJO_INSECURE:-0}" == "1" ]]; then
    insec=(-k)
  fi
  # ${arr[@]+...} keeps the empty-array expansion safe under 'set -u' on
  # bash releases older than 4.4.
  curl -fsSL ${insec[@]+"${insec[@]}"} --max-time 10 \
    "$FORGEJO_API_URL/api/v1/version" >/dev/null \
    || die "Forgejo API unreachable at $FORGEJO_API_URL"
  forgejo_api GET "/repos/$FORGEJO_OWNER/$FORGEJO_REPO" >/dev/null \
    || die "repo not found / token lacks read:repository"
  ok "Forgejo reachable + repo OK"

  # Best-effort lookup : any failure here just means "treat as not set".
  local _exists=0
  forgejo_api GET "/repos/$FORGEJO_OWNER/$FORGEJO_REPO/actions/secrets" 2>/dev/null \
    | jq -e '.[]? | select(.name == "FORGEJO_REGISTRY_TOKEN")' >/dev/null \
    && _exists=1

  if [[ "${FORCE_FORGEJO_REPROMPT:-0}" != "1" ]] && (( _exists == 1 )); then
    ok "FORGEJO_REGISTRY_TOKEN already set"
  else
    local rtok="${FORGEJO_REGISTRY_TOKEN:-}"
    if [[ -z "$rtok" ]]; then
      warn "create token at $FORGEJO_API_URL/-/user/settings/applications (write:package)"
      prompt_password rtok "paste token (input hidden)"
    fi
    # Same guard as the runner-token prompt in phase 4 : never store an
    # empty secret.
    [[ -n "$rtok" ]] || die "no token provided"
    forgejo_set_secret "$FORGEJO_OWNER" "$FORGEJO_REPO" FORGEJO_REGISTRY_TOKEN "$rtok"
  fi

  forgejo_set_secret "$FORGEJO_OWNER" "$FORGEJO_REPO" ANSIBLE_VAULT_PASSWORD "$vault_pw"
  forgejo_set_var "$FORGEJO_OWNER" "$FORGEJO_REPO" FORGEJO_REGISTRY_URL \
    "$FORGEJO_API_URL/api/packages/$FORGEJO_OWNER/generic"

  mark_done forgejo
}
# ============================================================================
# Phase 4 — local Ansible (connection: local, no SSH, no --ask-become-pass)
# ============================================================================
#######################################
# Phase 4 — run the Ansible bootstrap locally on the R720.
#
# Steps :
#   1. make sure the required Ansible collections are installed
#   2. obtain a runner registration token (auto-fetch, else manual prompt)
#   3. detect the Incus network the 'forgejo' container is attached to
#   4. run bootstrap_runner.yml + haproxy.yml against inventory/local.yml
#   5. report which certificates exist in the veza-haproxy container
#
# Globals:
#   FORGEJO_API_URL, FORGEJO_OWNER, FORGEJO_REPO, REPO_ROOT, VAULT_PASS (read)
#   _current_phase, TALAS_HINT (written)
# Arguments: none
# Returns:   0 on success or when the phase was already done ; aborts through
#            die when a collection cannot be installed, no token is
#            available, or ansible-playbook fails.
# Side effect: changes the working directory to $REPO_ROOT/infra/ansible.
#######################################
phase_4_ansible() {
  section "Phase 4 — Ansible bootstrap (local)"
  _current_phase=ansible
  skip_if_done ansible "ansible bootstrap" && return 0

  info "ensuring ansible collections"
  local col
  for col in community.general community.postgresql community.rabbitmq; do
    # awk compares the first column exactly and drains its whole input, so
    # the lister is never killed by SIGPIPE (which, under pipefail, would
    # look like "not installed" and trigger a needless reinstall).
    ansible-galaxy collection list "$col" 2>/dev/null \
      | awk -v want="$col" '$1 == want { found = 1 } END { exit !found }' \
      || ansible-galaxy collection install "$col" >/dev/null \
      || die "ansible-galaxy install $col failed"
  done
  ok "collections present"

  require_env FORGEJO_ADMIN_TOKEN
  info "fetching runner registration token from Forgejo"
  local reg_token=""
  # A "successful" fetch that yields nothing is treated like a failure :
  # fall through to the manual prompt rather than register with no token.
  if reg_token=$(forgejo_get_runner_token "$FORGEJO_OWNER" "$FORGEJO_REPO") \
    && [[ -n "$reg_token" ]]; then
    ok "got runner registration token (${#reg_token} chars)"
  else
    warn "auto-fetch failed — generate manually at :"
    warn " $FORGEJO_API_URL/$FORGEJO_OWNER/$FORGEJO_REPO/settings/actions/runners"
    prompt_password reg_token "paste runner registration token (input hidden)"
    [[ -n "$reg_token" ]] || die "no token provided"
  fi

  # The playbook and inventory paths below are relative to this directory.
  cd "$REPO_ROOT/infra/ansible" \
    || die "cannot cd to $REPO_ROOT/infra/ansible"

  # Detect network — we're root, sudo not needed.
  local detected_net
  detected_net=$(incus config device get forgejo eth0 network 2>/dev/null \
    | tr -d '[:space:]' || true)
  if [[ -z "$detected_net" || "$detected_net" == "None" ]]; then
    detected_net="net-veza"
  fi
  ok "Incus network : $detected_net"

  info "running bootstrap_runner.yml + haproxy.yml against inventory/local.yml"
  # NOTE(review): the registration token travels on the ansible-playbook
  # command line and is therefore visible in the process list while the
  # play runs ; consider an extra-vars file (-e @file) if that matters here.
  if ! ansible-playbook \
    -i inventory/local.yml \
    --vault-password-file "$VAULT_PASS" \
    -e forgejo_registration_token="$reg_token" \
    -e forgejo_api_url="$FORGEJO_API_URL" \
    -e veza_incus_network="$detected_net" \
    playbooks/bootstrap_runner.yml \
    playbooks/haproxy.yml; then
    TALAS_HINT="check ansible output above ; common: port 80 not reachable from Internet for LE HTTP-01"
    die "ansible-playbook failed"
  fi

  info "verifying Let's Encrypt certs"
  local certs
  certs=$(incus exec veza-haproxy -- ls /usr/local/etc/tls/haproxy/ 2>/dev/null || true)
  if [[ -n "$certs" ]]; then
    ok "certs : ${certs//$'\n'/ }"
  else
    warn "no certs — check port 80 reachable from Internet, then re-run"
  fi

  # NOTE(review): the phase is marked done even when no certs were found,
  # although the warning above asks for a re-run ; if skip_if_done honours
  # this marker a plain re-run would skip phase 4 — confirm against lib.sh.
  mark_done ansible
}
# ============================================================================
# Phase 5 — Summary
# ============================================================================
#######################################
# Phase 5 — print the closing summary and record the phase as done.
#
# Globals:
#   _GREEN, _BOLD, _RESET (read) — colour/format strings, expected to be
#                                  defined outside this function
#   FORGEJO_API_URL, FORGEJO_OWNER, FORGEJO_REPO, SCRIPT_DIR,
#   TALAS_STATE_FILE (read)
# Arguments: none
# Outputs:   the summary text, on stderr
#######################################
phase_5_summary() {
  section "Phase 5 — Summary"

  # One array element per output line ; printf repeats the format for each.
  local -a report=(
    "${_GREEN}${_BOLD}✓ Bootstrap complete (R720 local).${_RESET}"
    "Next steps :"
    "1. Trigger a manual deploy : $FORGEJO_API_URL/$FORGEJO_OWNER/$FORGEJO_REPO/actions"
    "2. Verify : sudo bash $SCRIPT_DIR/verify-r720.sh"
    "State file : $TALAS_STATE_FILE"
  )
  printf '%s\n' "${report[@]}" >&2

  mark_done summary
}
#######################################
# Run the bootstrap phases in order, starting at phase $PHASE (default 1).
#
# PHASE is validated before use : inside [[ ... -le ... ]] the operand is
# evaluated as an arithmetic expression, so an unvalidated value could
# abort with "unbound variable", silently run every phase, or (for values
# above 5) print "ALL DONE" without having done anything.
#
# Globals:   PHASE (read, optional) — first phase to run, 1..5
# Arguments: accepted for the 'main "$@"' convention, currently unused
# Returns:   0 when every selected phase succeeded ; aborts through die on
#            an invalid PHASE.
#######################################
main() {
  local start=${PHASE:-1}
  [[ "$start" =~ ^[1-5]$ ]] \
    || die "PHASE must be an integer between 1 and 5 (got '$start')"
  info "starting at phase $start (running on R720 as root)"

  # Phase N lives at index N-1.
  local -a phases=(
    phase_1_preflight
    phase_2_vault
    phase_3_forgejo
    phase_4_ansible
    phase_5_summary
  )
  local idx
  for (( idx = start; idx <= ${#phases[@]}; idx++ )); do
    "${phases[idx - 1]}"
  done

  ok "ALL DONE"
}
# Script entry point : forward the command-line arguments to main.
main "$@"