#!/usr/bin/env bash
# bootstrap-local.sh — drive bootstrap from the operator's workstation.
#
# Phases (each idempotent ; skipped if state file marks DONE) :
#   1. preflight — required tools, SSH to R720, DNS resolution
#   2. vault     — render + encrypt group_vars/all/vault.yml,
#                  write .vault-pass
#   3. forgejo   — set repo Secrets / Variables via Forgejo API
#   4. r720      — invoke bootstrap-remote.sh over SSH
#   5. haproxy   — ansible-playbook playbooks/haproxy.yml,
#                  verify Let's Encrypt certs land
#   6. summary   — final readiness report
#
# Resumable :
#   PHASE=4 ./bootstrap-local.sh     # restart at phase 4
#
# Inputs (env vars ; can be set in your shell or in scripts/bootstrap/.env) :
#   R720_HOST            ssh target (default: 10.0.20.150)
#   R720_USER            ssh user (default: ansible)
#   FORGEJO_API_URL      default: https://forgejo.talas.group
#                        override with http://10.0.20.105:3000 if no DNS yet
#   FORGEJO_OWNER        default: talas
#   FORGEJO_REPO         default: veza
#   FORGEJO_ADMIN_TOKEN  MANDATORY (Forgejo UI → Settings → Applications)
#   ALREADY_PUSHED       set to "1" if origin/main already has the
#                        current HEAD ; skips the auto-push prompt

set -Eeuo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=lib.sh
. "$SCRIPT_DIR/lib.sh"
trap_errors

# Optional .env in the bootstrap dir for non-secret defaults.
[[ -f "$SCRIPT_DIR/.env" ]] && . "$SCRIPT_DIR/.env"

: "${R720_HOST:=10.0.20.150}"
: "${R720_USER:=ansible}"
: "${FORGEJO_API_URL:=https://forgejo.talas.group}"
: "${FORGEJO_OWNER:=talas}"
: "${FORGEJO_REPO:=veza}"

REPO_ROOT=$(git -C "$SCRIPT_DIR" rev-parse --show-toplevel 2>/dev/null) \
  || die "not in a git repo (or git missing)"
VAULT_YML="$REPO_ROOT/infra/ansible/group_vars/all/vault.yml"
VAULT_EXAMPLE="$REPO_ROOT/infra/ansible/group_vars/all/vault.yml.example"
VAULT_PASS="$REPO_ROOT/infra/ansible/.vault-pass"

# State file lives under the repo so the local script doesn't need root.
TALAS_STATE_DIR="$REPO_ROOT/.git/talas-bootstrap"
TALAS_STATE_FILE="$TALAS_STATE_DIR/local.state"

# ============================================================================
# Vault autofill helpers (used by phase 2)
# ============================================================================

# Regex matching the placeholder values shipped in vault.yml.example.
# NOTE(review): the original literal was lost to markup-stripping (everything
# between angle brackets was eaten) — this pattern is a reconstruction.
# Confirm it matches the placeholders actually used in
# group_vars/all/vault.yml.example before trusting the autofill phase.
readonly TODO_RE='<TODO[^"]*>'

# Generate a URL-safe random string (no /=+ which break sed and yaml).
# $1 : desired length in characters (default 32).
_rand_token() {
  local len=${1:-32}
  # Ask openssl for ~2x the needed entropy in base64, then keep only
  # alphanumerics so the value is safe inside sed patterns and YAML quotes.
  openssl rand -base64 $((len * 2)) 2>/dev/null | tr -dc 'A-Za-z0-9' | head -c "$len"
}

# Replace a single `<key>: "<TODO…>"` line with a generated value.
# Idempotent : if the line is already non-TODO, no-op (sed matches nothing).
# $1 : target file, $2 : yaml key, $3 : replacement value.
_autofill_field() {
  local file=$1 key=$2 value=$3
  # Escape sed delimiters in value (we use | as delimiter, so escape any |)
  local esc=${value//|/\\|}
  sed -i "s|^${key}: \"${TODO_RE}\"|${key}: \"${esc}\"|" "$file"
}

# Generate an RS256 keypair and insert it (base64, one line each) into the
# vault_jwt_signing_key_b64 / vault_jwt_public_key_b64 fields.
# No-op when the signing-key field is no longer a placeholder.
_autogen_jwt_keys() {
  local file=$1
  if ! grep -q "^vault_jwt_signing_key_b64: \"${TODO_RE}\"" "$file"; then
    return 0
  fi
  info "generating RS256 JWT keypair"
  local priv pub
  priv=$(openssl genrsa 4096 2>/dev/null) || die "openssl genrsa failed"
  pub=$(echo "$priv" | openssl rsa -pubout 2>/dev/null) || die "openssl rsa -pubout failed"
  local priv_b64 pub_b64
  priv_b64=$(echo "$priv" | base64 -w0)
  pub_b64=$(echo "$pub" | base64 -w0)
  _autofill_field "$file" vault_jwt_signing_key_b64 "$priv_b64"
  _autofill_field "$file" vault_jwt_public_key_b64 "$pub_b64"
  ok "JWT keys generated and inserted"
}

# Autofill all the vault fields whose value can be safely random-generated.
# Optional / external fields (smtp, hyperswitch, stripe, oauth_clients,
# sentry) are left as placeholders for the operator to either fill or skip.
_autofill_vault_secrets() {
  local file=$1
  local filled=()
  # Strong passwords (32 alphanumeric chars).
  local pw_fields=(
    vault_postgres_password
    vault_postgres_replication_password
    vault_redis_password
    vault_rabbitmq_password
    vault_minio_root_password
    vault_chat_jwt_secret
    vault_oauth_encryption_key
    vault_stream_internal_api_key
  )
  local k
  for k in "${pw_fields[@]}"; do
    # Only fill fields still carrying a placeholder — re-runs are no-ops.
    if grep -q "^${k}: \"${TODO_RE}\"" "$file"; then
      _autofill_field "$file" "$k" "$(_rand_token 32)"
      filled+=("$k")
    fi
  done
  if (( ${#filled[@]} > 0 )); then
    ok "auto-generated ${#filled[@]} secret(s) : ${filled[*]}"
  fi
}

# ============================================================================
# Phase 1 — preflight
# ============================================================================

# Verify local tooling, SSH + incus access to the R720, and public DNS.
phase_1_preflight() {
  section "Phase 1 — Preflight"
  _current_phase=preflight
  phase preflight START
  skip_if_done preflight "preflight" && { phase preflight DONE; return 0; }

  require_cmd git ansible ansible-vault dig curl ssh openssl base64 jq
  require_file "$VAULT_EXAMPLE"
  require_file "$REPO_ROOT/infra/ansible/playbooks/haproxy.yml"
  require_file "$REPO_ROOT/infra/ansible/inventory/staging.yml"

  info "Testing SSH to $R720_USER@$R720_HOST…"
  # BatchMode forbids password prompts so a missing key fails fast.
  if ! ssh -o ConnectTimeout=5 -o BatchMode=yes "$R720_USER@$R720_HOST" /bin/true 2>/dev/null; then
    TALAS_HINT="ensure your ssh key is in $R720_USER@$R720_HOST:~/.ssh/authorized_keys, then try ssh $R720_USER@$R720_HOST"
    die "SSH to $R720_USER@$R720_HOST failed"
  fi
  ok "SSH OK"

  info "Checking that incus is reachable on R720…"
  if ! ssh "$R720_USER@$R720_HOST" "command -v incus >/dev/null && incus list >/dev/null 2>&1"; then
    TALAS_HINT="run 'incus list' as $R720_USER on $R720_HOST manually ; verify the user is in the 'incus-admin' group"
    die "incus on $R720_HOST not accessible by $R720_USER"
  fi
  ok "incus reachable"

  info "Checking DNS resolution for the public domains…"
  local missing_dns=() d
  for d in veza.fr staging.veza.fr talas.fr forgejo.talas.group; do
    # Query 1.1.1.1 directly so a stale local resolver can't hide a gap;
    # require an A record (IPv4 dotted quad) in the answer.
    if ! dig +short +time=2 +tries=1 "$d" @1.1.1.1 2>/dev/null \
        | grep -qE '^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$'; then
      missing_dns+=("$d")
    fi
  done
  if (( ${#missing_dns[@]} > 0 )); then
    warn "DNS not resolved for: ${missing_dns[*]}"
    warn "Let's Encrypt (phase 5) will fail for those domains. Configure DNS first or expect partial cert issuance."
  else
    ok "all 4 public domains resolve"
  fi

  mark_done preflight
  phase preflight DONE
}

# ============================================================================
# Phase 2 — vault
# ============================================================================

# Render vault.yml from the example, autofill generatable secrets, write
# .vault-pass, encrypt, and verify decryption round-trips.
phase_2_vault() {
  section "Phase 2 — Local vault"
  _current_phase=vault
  phase vault START
  if skip_if_done vault "vault setup"; then phase vault DONE; return 0; fi

  if [[ -f "$VAULT_YML" ]] && head -1 "$VAULT_YML" 2>/dev/null | grep -q '^\$ANSIBLE_VAULT'; then
    info "vault.yml already encrypted — verifying password works"
    [[ -f "$VAULT_PASS" ]] || die "vault.yml encrypted but $VAULT_PASS missing — re-create it manually"
  else
    if [[ -f "$VAULT_YML" ]]; then
      warn "vault.yml exists in PLAINTEXT — will autofill remaining + encrypt"
    else
      info "rendering vault.yml from example"
      cp "$VAULT_EXAMPLE" "$VAULT_YML"
    fi

    _autogen_jwt_keys "$VAULT_YML"
    _autofill_vault_secrets "$VAULT_YML"

    local remaining
    # grep -c prints 0 and exits 1 on no match ; || true keeps set -e calm.
    remaining=$(grep -cE "\"${TODO_RE}\"" "$VAULT_YML" || true)
    if (( remaining > 0 )); then
      warn "$remaining placeholders left (optional fields ; safe to leave or fill later)"
      grep -n "\"${TODO_RE}\"" "$VAULT_YML" >&2
      local cont
      prompt_value cont "blank these out and continue ? (y/n)" "y"
      if [[ "${cont,,}" == "y" ]]; then
        # Replace any line whose value still has a placeholder with "".
        # NOTE(review): reconstructed after markup-stripping — confirm the
        # key-name character class covers every key in vault.yml.example.
        sed -i -E "s|^([A-Za-z0-9_]+): \"${TODO_RE}\"|\1: \"\"|" "$VAULT_YML"
      fi
    fi

    # Create the vault password once ; never clobber an existing one.
    # NOTE(review): generation step reconstructed — the original code between
    # the placeholder prompt and chmod was lost to markup-stripping.
    if [[ ! -f "$VAULT_PASS" ]]; then
      _rand_token 48 > "$VAULT_PASS"
    fi
    chmod 0400 "$VAULT_PASS"
    ok "wrote $VAULT_PASS"

    # If vault.yml is plaintext, encrypt now.
    if ! head -1 "$VAULT_YML" | grep -q '^\$ANSIBLE_VAULT'; then
      info "encrypting vault.yml"
      ansible-vault encrypt --vault-password-file "$VAULT_PASS" "$VAULT_YML"
      ok "encrypted"
    fi
  fi

  info "verifying we can decrypt"
  if ! ansible-vault view --vault-password-file "$VAULT_PASS" "$VAULT_YML" >/dev/null 2>&1; then
    TALAS_HINT="if you remember the password, edit $VAULT_PASS to match. Otherwise run scripts/bootstrap/reset-vault.sh to start over."
    die "cannot decrypt $VAULT_YML with $VAULT_PASS — password mismatch"
  fi
  ok "vault decryption verified"

  mark_done vault
  phase vault DONE
}

# ============================================================================
# Phase 3 — Forgejo Secrets + Variables
# ============================================================================

# Provision repo-level Actions secrets (registry token, vault password) and
# the registry-URL variable via the Forgejo API.
phase_3_forgejo() {
  section "Phase 3 — Forgejo Secrets + Variables"
  _current_phase=forgejo
  phase forgejo START
  if skip_if_done forgejo "Forgejo provisioning"; then phase forgejo DONE; return 0; fi

  require_env FORGEJO_ADMIN_TOKEN \
    "create at $FORGEJO_API_URL/-/user/settings/applications (scopes: write:repository + write:package, optionally write:admin to auto-create registry tokens)"

  local insecure=()
  [[ "${FORGEJO_INSECURE:-0}" == "1" ]] && insecure=(-k)

  info "checking Forgejo API reachability (no-auth /version probe)"
  # ${arr[@]+…} guards the empty-array expansion against set -u on bash < 4.4.
  if ! curl -fsSL ${insecure[@]+"${insecure[@]}"} --max-time 10 \
      "$FORGEJO_API_URL/api/v1/version" >/dev/null 2>&1; then
    TALAS_HINT="check FORGEJO_API_URL ($FORGEJO_API_URL) ; for self-signed certs set FORGEJO_INSECURE=1 in .env ; verify WireGuard if URL is on the LAN"
    die "Forgejo API unreachable"
  fi
  ok "Forgejo API reachable"

  info "checking repo $FORGEJO_OWNER/$FORGEJO_REPO + token has write access"
  if ! forgejo_api GET "/repos/$FORGEJO_OWNER/$FORGEJO_REPO" >/dev/null 2>&1; then
    TALAS_HINT="verify FORGEJO_OWNER + FORGEJO_REPO (currently $FORGEJO_OWNER/$FORGEJO_REPO) ; verify token scope includes read:repository"
    die "repo $FORGEJO_OWNER/$FORGEJO_REPO not found or token lacks read:repository"
  fi
  ok "repo + token OK"

  # FORGEJO_REGISTRY_TOKEN — set once, then leave alone. Re-runs of
  # phase 3 don't re-prompt unless the secret has been deleted in
  # Forgejo UI, OR the operator sets FORCE_FORGEJO_REPROMPT=1.
  # NB: Forgejo doesn't expose GET /actions/secrets/{name} — we list
  # all secrets and select by name with jq.
  local _secret_exists=0
  if forgejo_api GET "/repos/$FORGEJO_OWNER/$FORGEJO_REPO/actions/secrets" 2>/dev/null \
      | jq -e '.[]? | select(.name == "FORGEJO_REGISTRY_TOKEN")' >/dev/null 2>&1; then
    _secret_exists=1
  fi

  if [[ "${FORCE_FORGEJO_REPROMPT:-0}" != "1" ]] && (( _secret_exists == 1 )); then
    ok "secret FORGEJO_REGISTRY_TOKEN already set (set FORCE_FORGEJO_REPROMPT=1 to replace)"
  else
    local registry_token=""
    if [[ -n "${FORGEJO_REGISTRY_TOKEN:-}" ]]; then
      info "using FORGEJO_REGISTRY_TOKEN from environment"
      registry_token="$FORGEJO_REGISTRY_TOKEN"
    else
      info "trying to auto-create a registry token (needs write:admin scope on admin token)"
      local resp
      # Timestamped name so repeat attempts don't collide on token names.
      resp=$(forgejo_api POST "/users/$FORGEJO_OWNER/tokens" \
        --data "$(jq -nc --arg n "veza-deploy-registry-$(date +%s)" \
          --argjson s '["write:package", "read:package"]' \
          '{name: $n, scopes: $s}')" 2>/dev/null \
        || true)
      registry_token=$(echo "$resp" | jq -r '.sha1 // empty' 2>/dev/null || true)
      if [[ -z "$registry_token" ]]; then
        warn "auto-create failed (admin token lacks write:admin or sudo)"
        warn "create the token manually :"
        warn "  $FORGEJO_API_URL/-/user/settings/applications"
        warn "  → 'Generate New Token' → name 'veza-deploy-registry'"
        warn "  → scopes: write:package, read:package"
        prompt_password registry_token "paste the token value (input hidden)"
      else
        ok "auto-created registry token (${#registry_token} chars)"
      fi
    fi
    forgejo_set_secret "$FORGEJO_OWNER" "$FORGEJO_REPO" FORGEJO_REGISTRY_TOKEN "$registry_token"
  fi

  # Vault password is always re-set from the current .vault-pass — cheap,
  # idempotent, and survives a re-run after rotation.
  forgejo_set_secret "$FORGEJO_OWNER" "$FORGEJO_REPO" ANSIBLE_VAULT_PASSWORD "$(cat "$VAULT_PASS")"

  forgejo_set_var "$FORGEJO_OWNER" "$FORGEJO_REPO" FORGEJO_REGISTRY_URL \
    "$FORGEJO_API_URL/api/packages/$FORGEJO_OWNER/generic"

  mark_done forgejo
  phase forgejo DONE
}

# ============================================================================
# Phase 4 — R720 remote bootstrap
# ============================================================================

# Upload lib.sh + bootstrap-remote.sh to the R720 and run them under sudo
# with a fresh runner registration token.
phase_4_r720() {
  section "Phase 4 — R720 remote bootstrap (Incus profiles + runner labels)"
  _current_phase=r720
  phase r720 START
  if skip_if_done r720 "R720 remote bootstrap"; then phase r720 DONE; return 0; fi

  require_env FORGEJO_ADMIN_TOKEN

  info "fetching a runner registration token from Forgejo"
  local reg_token
  reg_token=$(forgejo_get_runner_token "$FORGEJO_OWNER" "$FORGEJO_REPO") \
    || die "could not fetch runner registration token"
  info "got registration token (${#reg_token} chars)"

  local remote_script="$SCRIPT_DIR/bootstrap-remote.sh"
  local remote_lib="$SCRIPT_DIR/lib.sh"
  require_file "$remote_script"
  require_file "$remote_lib"

  # SSH target string handles both "user@host" and pure "host" (when
  # the alias's User= line is the source of truth).
  local ssh_target
  if [[ -n "${R720_USER:-}" ]]; then
    ssh_target="${R720_USER}@${R720_HOST}"
  else
    ssh_target="${R720_HOST}"
  fi

  info "uploading lib.sh + bootstrap-remote.sh to $ssh_target:/tmp/talas-bootstrap/"
  ssh "$ssh_target" "mkdir -p /tmp/talas-bootstrap" \
    || die "ssh mkdir failed (target: $ssh_target)"
  scp -q "$remote_lib" "$remote_script" "$ssh_target:/tmp/talas-bootstrap/" \
    || die "scp failed (target: $ssh_target)"
  ok "uploaded"

  info "running bootstrap-remote.sh over ssh -t (TTY for sudo prompt)"
  info "  → if sudo asks for a password, type it once at the prompt below"
  # ssh -t allocates a TTY so sudo can prompt for the password. Set vars
  # via env=… so they're available inside the sudo'd script (sudo -E
  # only preserves explicit pre-existing env vars, not ones set on the
  # ssh command line). The /var/log/talas-bootstrap.log on R720 keeps
  # a copy of the output even if the SSH stream gets cut.
  if ! ssh -t "$ssh_target" \
      "sudo env FORGEJO_REGISTRATION_TOKEN='$reg_token' \
        FORGEJO_API_URL='$FORGEJO_API_URL' \
        bash /tmp/talas-bootstrap/bootstrap-remote.sh"; then
    TALAS_HINT="ssh to $ssh_target and tail /var/log/talas-bootstrap.log ; or set up passwordless sudo : echo '$R720_USER ALL=(ALL) NOPASSWD: /usr/bin/bash' | sudo tee /etc/sudoers.d/talas-bootstrap"
    die "remote bootstrap failed"
  fi

  # Cleanup uploaded scripts (best-effort ; never fail the phase on this).
  ssh "$ssh_target" "rm -rf /tmp/talas-bootstrap" || true

  mark_done r720
  phase r720 DONE
}

# ============================================================================
# Phase 5 — Edge HAProxy + Let's Encrypt
# ============================================================================

# Run the haproxy playbook against the R720 and verify certs were issued.
phase_5_haproxy() {
  section "Phase 5 — Edge HAProxy + Let's Encrypt certs"
  _current_phase=haproxy
  phase haproxy START
  if skip_if_done haproxy "haproxy + LE"; then phase haproxy DONE; return 0; fi

  cd "$REPO_ROOT/infra/ansible"

  # Ansible collections needed by the haproxy/deploy playbooks.
  # ansible.cfg sets stdout_callback=yaml which lives in
  # community.general — without it, ansible-playbook errors out
  # immediately ("Invalid callback for stdout specified: yaml").
  info "ensuring ansible collections (community.general / .postgresql / .rabbitmq) are installed"
  local col
  for col in community.general community.postgresql community.rabbitmq; do
    if ! ansible-galaxy collection list "$col" 2>/dev/null | grep -q "^$col"; then
      info "installing $col"
      ansible-galaxy collection install "$col" >/dev/null \
        || die "ansible-galaxy collection install $col failed (network ? ~/.ansible/ writable ?)"
    fi
  done
  ok "collections present"

  # Compute SSH target the same way phase 4 does.
  local ssh_target
  if [[ -n "${R720_USER:-}" ]]; then
    ssh_target="${R720_USER}@${R720_HOST}"
  else
    ssh_target="${R720_HOST}"
  fi

  # Detect if NOPASSWD sudo is configured ; if not, pass --ask-become-pass.
  local become_flag=()
  if ssh "$ssh_target" "sudo -n /bin/true" >/dev/null 2>&1; then
    ok "passwordless sudo on R720 — running ansible without -K"
  else
    info "sudo on R720 needs a password — passing --ask-become-pass"
    info "  → ansible will prompt 'BECOME password:' below ; type your sudo password"
    become_flag=(--ask-become-pass)
  fi

  # Detect the Incus network actually present on the R720. The
  # group_vars default is `veza-net` but the operator's R720 may
  # already have a different bridge name (e.g. `incusbr0`). Probe
  # via the existing forgejo container (whose network we know
  # works) and fall back to `incus network list`.
  info "detecting Incus network on R720"
  local detected_net=""
  detected_net=$(ssh "$ssh_target" \
    "sudo incus config device get forgejo eth0 network 2>/dev/null" \
    | tr -d '[:space:]' || true)
  if [[ -z "$detected_net" || "$detected_net" == "None" ]]; then
    # Pick the first managed bridge that incus knows about.
    detected_net=$(ssh "$ssh_target" \
      "sudo incus network list -f csv 2>/dev/null | awk -F, '\$2==\"bridge\" && \$3==\"YES\" {print \$1; exit}'" \
      | tr -d '[:space:]' || true)
  fi
  local extra_vars=()
  if [[ -n "$detected_net" ]]; then
    ok "Incus network detected : $detected_net"
    extra_vars+=("--extra-vars" "veza_incus_network=$detected_net")
  else
    warn "could not auto-detect Incus network ; playbook will use the group_vars default"
  fi

  info "running ansible-playbook playbooks/haproxy.yml (5–10 min)"
  # ${arr[@]+…} guards the empty-array expansions against set -u on bash < 4.4.
  if ! ansible-playbook -i inventory/staging.yml playbooks/haproxy.yml \
      --vault-password-file .vault-pass \
      ${become_flag[@]+"${become_flag[@]}"} \
      ${extra_vars[@]+"${extra_vars[@]}"}; then
    TALAS_HINT="check the ansible output above ; common issues : Incus network mismatch, port 80 blocked from Internet, DNS not yet propagated, sudo password rejected"
    die "ansible-playbook haproxy.yml failed"
  fi

  info "verifying Let's Encrypt certs landed"
  local certs
  certs=$(ssh "$R720_USER@$R720_HOST" "incus exec veza-haproxy -- ls /usr/local/etc/tls/haproxy/ 2>/dev/null" || true)
  if [[ -z "$certs" ]]; then
    warn "no certs found in /usr/local/etc/tls/haproxy/ on veza-haproxy"
    warn "check /var/log/letsencrypt or run again — dehydrated retries on next playbook run"
    return 1
  fi
  ok "certs : $(echo "$certs" | tr '\n' ' ')"

  mark_done haproxy
  phase haproxy DONE
}

# ============================================================================
# Phase 6 — Summary
# ============================================================================

# Print the final readiness report to stderr.
phase_6_summary() {
  section "Phase 6 — Summary"
  _current_phase=summary
  phase summary START

  # NOTE(review): the `cat <<EOF >&2` operator was reconstructed — the
  # original heredoc introducer was lost to markup-stripping.
  cat <<EOF >&2

${_GREEN}${_BOLD}✓ Bootstrap complete.${_RESET}

What works now :
  • Forgejo registry has the deploy secrets + variable.
  • forgejo-runner has the 'incus' label and Incus socket access.
  • veza-haproxy edge container is up with Let's Encrypt certs.

What you can do next :
  1. Trigger a manual deploy via Forgejo Actions UI :
       $FORGEJO_API_URL/$FORGEJO_OWNER/$FORGEJO_REPO/actions
       → "Veza deploy" → "Run workflow" → env=staging.
  2. Once that run is green, re-enable auto-trigger :
       $SCRIPT_DIR/enable-auto-deploy.sh
  3. Verify state any time :
       $SCRIPT_DIR/verify-local.sh
       ssh $R720_USER@$R720_HOST $SCRIPT_DIR/verify-remote.sh

State file : $TALAS_STATE_FILE
EOF

  mark_done summary
  phase summary DONE
}

# ============================================================================
# main
# ============================================================================

# Run phases >= PHASE (default 1) in order ; each phase self-skips when its
# state-file entry is already DONE.
main() {
  local start=${PHASE:-1}
  info "starting at phase $start"
  [[ $start -le 1 ]] && phase_1_preflight
  [[ $start -le 2 ]] && phase_2_vault
  [[ $start -le 3 ]] && phase_3_forgejo
  [[ $start -le 4 ]] && phase_4_r720
  [[ $start -le 5 ]] && phase_5_haproxy
  [[ $start -le 6 ]] && phase_6_summary
  ok "ALL DONE"
}

main "$@"