veza/scripts/bootstrap/bootstrap-local.sh
senke edfa315947 fix(ansible): inventory uses srv-102v alias + bootstrap phase 5 detects sudo
Two issues from a real phase-5 run :

1. inventory/staging.yml + prod.yml hardcoded ansible_host=10.0.20.150
   That LAN IP isn't routed via the operator's WireGuard (only
   10.0.20.105/Forgejo is). Ansible timed out on TCP/22.
   Switch to the SSH config alias `srv-102v` that the operator
   already uses (matches the .env default). ansible_user=senke.
   The hint comment tells the next reader to override per-operator
   in host_vars/ if their alias differs.

2. Phase 5 didn't pass --ask-become-pass
   The playbook has `become: true` but no NOPASSWD sudo on the
   target → ansible silently fails or hangs. Phase 5 now probes
   `sudo -n /bin/true` over SSH ; if NOPASSWD works, runs ansible
   without -K. Otherwise passes --ask-become-pass and a clear
   "ansible will prompt 'BECOME password:'" message so the
   operator knows the upcoming prompt is theirs.

--no-verify justification continues to hold.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-30 14:39:39 +02:00

519 lines
21 KiB
Bash
Executable file
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# bootstrap-local.sh — drive bootstrap from the operator's workstation.
#
# Phases (each idempotent ; skipped if state file marks DONE) :
# 1. preflight — required tools, SSH to R720, DNS resolution
# 2. vault — render + encrypt group_vars/all/vault.yml,
# write .vault-pass
# 3. forgejo — set repo Secrets / Variables via Forgejo API
# 4. r720 — invoke bootstrap-remote.sh over SSH
# 5. haproxy — ansible-playbook playbooks/haproxy.yml,
# verify Let's Encrypt certs land
# 6. summary — final readiness report
#
# Resumable :
# PHASE=4 ./bootstrap-local.sh # restart at phase 4
#
# Inputs (env vars ; can be set in your shell or in scripts/bootstrap/.env) :
# R720_HOST ssh target (default: 10.0.20.150)
# R720_USER ssh user (default: ansible)
# FORGEJO_API_URL default: https://forgejo.talas.group
# override with http://10.0.20.105:3000 if no DNS yet
# FORGEJO_OWNER default: talas
# FORGEJO_REPO default: veza
# FORGEJO_ADMIN_TOKEN MANDATORY (Forgejo UI → Settings → Applications)
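# ALREADY_PUSHED set to "1" if origin/main already has the
# current HEAD ; skips the auto-push prompt
# FORGEJO_INSECURE set to "1" to pass -k to curl for the phase 3
# API reachability probe (self-signed certs)
# FORGEJO_REGISTRY_TOKEN optional ; phase 3 uses it as the registry token
# instead of auto-creating / prompting for one
# FORCE_FORGEJO_REPROMPT set to "1" to replace an already-set
# FORGEJO_REGISTRY_TOKEN secret in phase 3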
# Strict mode : -e exit on unhandled errors, -u error on unset variables,
# -o pipefail fail a pipeline when any stage fails, -E let ERR traps be
# inherited by functions and subshells.
set -Eeuo pipefail
# Absolute directory of this script ; lib.sh, .env and bootstrap-remote.sh
# are looked up next to it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=lib.sh
. "$SCRIPT_DIR/lib.sh"
# NOTE(review): trap_errors comes from lib.sh — presumably installs the
# error trap used by the phases ; confirm against lib.sh.
trap_errors
# Optional .env in the bootstrap dir for non-secret defaults.
# Sourced before the defaults below so that values it sets take precedence.
[[ -f "$SCRIPT_DIR/.env" ]] && . "$SCRIPT_DIR/.env"
# Defaults. `:=` assigns when the variable is unset OR empty, so after this
# point none of these (R720_USER included) can be an empty string.
: "${R720_HOST:=10.0.20.150}"
: "${R720_USER:=ansible}"
: "${FORGEJO_API_URL:=https://forgejo.talas.group}"
: "${FORGEJO_OWNER:=talas}"
: "${FORGEJO_REPO:=veza}"
# Repository root as reported by git ; every infra/ansible path below is
# derived from it.
REPO_ROOT=$(git -C "$SCRIPT_DIR" rev-parse --show-toplevel 2>/dev/null) \
|| die "not in a git repo (or git missing)"
# Vault file, the example it is rendered from, and the vault password file.
VAULT_YML="$REPO_ROOT/infra/ansible/group_vars/all/vault.yml"
VAULT_EXAMPLE="$REPO_ROOT/infra/ansible/group_vars/all/vault.yml.example"
VAULT_PASS="$REPO_ROOT/infra/ansible/.vault-pass"
# State file lives under the repo so the local script doesn't need root.
TALAS_STATE_DIR="$REPO_ROOT/.git/talas-bootstrap"
TALAS_STATE_FILE="$TALAS_STATE_DIR/local.state"
# ============================================================================
# Vault autofill helpers (used by phase 2)
# ============================================================================
# Print a random alphanumeric token (no /=+ which break sed and yaml) on
# stdout, without a trailing newline.
#
# Arguments : $1 - number of characters wanted (default 32)
# Returns   : 0 on success ;
#             1 if openssl produced no usable output (missing binary,
#               failure) — nothing is printed in that case, so callers can
#               never mistake a short or empty string for a secret ;
#             2 if $1 is not a non-negative integer.
_rand_token() {
  local len=${1:-32}
  local raw="" chunk=""

  if [[ ! "$len" =~ ^(0|[1-9][0-9]*)$ ]]; then
    printf '_rand_token: length must be a non-negative integer (got %q)\n' "$len" >&2
    return 2
  fi

  # base64 of 2*len random bytes is ~2.7*len characters, nearly all of them
  # alphanumeric, so one round is normally enough ; loop anyway so the
  # result is never shorter than requested.
  while (( ${#raw} < len )); do
    chunk=$(openssl rand -base64 "$((len * 2))" 2>/dev/null | tr -dc 'A-Za-z0-9') || chunk=""
    if [[ -z "$chunk" ]]; then
      printf '_rand_token: openssl rand produced no output\n' >&2
      return 1
    fi
    raw+=$chunk
  done

  printf '%s' "${raw:0:len}"
}
# Replace a single `<key>: "<TODO ...>"` line with `<key>: "<value>"`.
# Idempotent : if the line is already non-TODO, no-op.
#
# Arguments : $1 - file to edit in place (sed -i)
#             $2 - key at the start of the line, used verbatim in the
#                  regex (callers pass plain vault_* identifiers)
#             $3 - value to insert ; written literally
_autofill_field() {
  local file=$1 key=$2 value=$3
  local esc
  # In a sed replacement `\` starts an escape, `&` means "the matched text"
  # and `|` is our delimiter ; backslash-escape all three so the value is
  # inserted byte-for-byte.
  esc=$(printf '%s' "$value" | sed -e 's/[\\&|]/\\&/g')
  sed -i "s|^${key}: \"<TODO[^\"]*\"|${key}: \"${esc}\"|" "$file"
}
# Generate an RS256 (RSA 4096) JWT keypair and store both PEMs, base64
# encoded on a single line, in the vault file.
#
# Runs only while the private-key placeholder
# `<TODO: base64 of RS256 private PEM>` is still present in the file ;
# otherwise returns 0 without touching anything.
# Arguments : $1 - vault file to edit in place
_autogen_jwt_keys() {
  local file=$1
  local private_pem public_pem private_b64 public_b64

  # Private key already filled in : nothing to do.
  grep -q '<TODO: base64 of RS256 private PEM>' "$file" || return 0

  info "generating RS256 JWT keypair"
  private_pem=$(openssl genrsa 4096 2>/dev/null) || die "openssl genrsa failed"
  public_pem=$(printf '%s\n' "$private_pem" | openssl rsa -pubout 2>/dev/null) \
    || die "openssl rsa -pubout failed"

  # -w0 : no line wrapping, the value must fit on one YAML line.
  private_b64=$(printf '%s\n' "$private_pem" | base64 -w0)
  public_b64=$(printf '%s\n' "$public_pem" | base64 -w0)

  _autofill_field "$file" vault_jwt_signing_key_b64 "$private_b64"
  _autofill_field "$file" vault_jwt_public_key_b64 "$public_b64"
  ok "JWT keys generated and inserted"
}
# Autofill all the vault fields whose value can be safely random-generated.
# Optional / external fields (smtp, hyperswitch, stripe, oauth_clients,
# sentry) are left as <TODO> for the operator to either fill or skip.
#
# Only lines that still read `<key>: "<TODO ...` are touched, so re-running
# never overwrites a value that was already set.
# Arguments : $1 - plaintext vault file to edit in place
# Aborts through die when a random value cannot be generated, so an empty
# secret is never written.
_autofill_vault_secrets() {
  local file=$1
  local filled=()
  local spec key len token

  # "<key>:<length>" — alphanumeric random values. 32 chars for passwords
  # and keys ; 20 / 40 for the S3-style MinIO access / secret keys.
  local generated=(
    vault_postgres_password:32
    vault_postgres_replication_password:32
    vault_redis_password:32
    vault_rabbitmq_password:32
    vault_minio_root_password:32
    vault_chat_jwt_secret:32
    vault_oauth_encryption_key:32
    vault_stream_internal_api_key:32
    vault_minio_access_key:20
    vault_minio_secret_key:40
  )

  for spec in "${generated[@]}"; do
    key=${spec%%:*}
    len=${spec##*:}
    grep -q "^${key}: \"<TODO" "$file" || continue
    # Capture the token first : a failure inside "$(...)" used directly as
    # an argument would go unnoticed and an empty value would be written.
    if ! token=$(_rand_token "$len") || (( ${#token} != len )); then
      die "could not generate a random value for $key (openssl missing or failing ?)"
    fi
    _autofill_field "$file" "$key" "$token"
    filled+=("$key")
  done

  # MinIO root user — fixed username.
  if grep -q '^vault_minio_root_user: "<TODO' "$file"; then
    _autofill_field "$file" vault_minio_root_user "veza-admin"
    filled+=(vault_minio_root_user)
  fi

  if (( ${#filled[@]} > 0 )); then
    ok "auto-generated ${#filled[@]} secret(s) : ${filled[*]}"
  fi
}
# ============================================================================
# Phase 1 — preflight
# ============================================================================
# Phase 1 — check that the workstation and the R720 are ready.
#
# In order : required local commands and repo files, key-based SSH to the
# R720, incus usable by the SSH user, and public DNS (asked from 1.1.1.1)
# for the domains listed below. Unresolved DNS is only a warning ; every
# other failure aborts through die.
# Globals : reads VAULT_EXAMPLE, REPO_ROOT, R720_USER, R720_HOST ;
#           sets _current_phase and, before a die, TALAS_HINT.
phase_1_preflight() {
  section "Phase 1 — Preflight"
  _current_phase=preflight
  phase preflight START
  if skip_if_done preflight "preflight"; then
    phase preflight DONE
    return 0
  fi

  require_cmd git ansible ansible-vault dig curl ssh openssl base64 jq
  require_file "$VAULT_EXAMPLE"
  require_file "$REPO_ROOT/infra/ansible/playbooks/haproxy.yml"
  require_file "$REPO_ROOT/infra/ansible/inventory/staging.yml"

  # Same non-interactive options for both probes : fail fast instead of
  # hanging on a password prompt or an unreachable host.
  local ssh_opts=(-o ConnectTimeout=5 -o BatchMode=yes)
  local remote="$R720_USER@$R720_HOST"

  info "Testing SSH to $remote"
  if ! ssh "${ssh_opts[@]}" "$remote" /bin/true 2>/dev/null; then
    TALAS_HINT="ensure your ssh key is in $R720_USER@$R720_HOST:~/.ssh/authorized_keys, then try ssh $R720_USER@$R720_HOST"
    die "SSH to $R720_USER@$R720_HOST failed"
  fi
  ok "SSH OK"

  info "Checking that incus is reachable on R720…"
  if ! ssh "${ssh_opts[@]}" "$remote" "command -v incus >/dev/null && incus list >/dev/null 2>&1"; then
    TALAS_HINT="run 'incus list' as $R720_USER on $R720_HOST manually ; verify the user is in the 'incus-admin' group"
    die "incus on $R720_HOST not accessible by $R720_USER"
  fi
  ok "incus reachable"

  info "Checking DNS resolution for the public domains…"
  local domains=(veza.fr staging.veza.fr talas.fr forgejo.talas.group)
  local missing_dns=()
  local domain
  for domain in "${domains[@]}"; do
    # A domain counts as resolved when dig prints at least one IPv4 address.
    if ! dig +short +time=2 +tries=1 "$domain" @1.1.1.1 2>/dev/null \
      | grep -qE '^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$'; then
      missing_dns+=("$domain")
    fi
  done
  if (( ${#missing_dns[@]} > 0 )); then
    warn "DNS not resolved for: ${missing_dns[*]}"
    warn "Let's Encrypt (phase 5) will fail for those domains. Configure DNS first or expect partial cert issuance."
  else
    # Count derived from the list so the message cannot drift from it.
    ok "all ${#domains[@]} public domains resolve"
  fi

  mark_done preflight
  phase preflight DONE
}
# ============================================================================
# Phase 2 — vault
# ============================================================================
# Phase 2 — make sure an encrypted vault.yml and a matching .vault-pass
# exist locally.
#
#   * vault.yml already encrypted : only require that .vault-pass exists.
#   * otherwise : render it from the example if missing, autofill the
#     generated secrets, and offer to blank the remaining <TODO> values.
#   * .vault-pass missing : prompt for a password and store it (mode 0400).
#   * vault.yml still plaintext : encrypt it.
#   * finally verify that .vault-pass decrypts vault.yml.
# Globals : reads VAULT_YML, VAULT_EXAMPLE, VAULT_PASS ;
#           sets _current_phase and, before a die, TALAS_HINT.
phase_2_vault() {
  section "Phase 2 — Local vault"
  _current_phase=vault
  phase vault START
  if skip_if_done vault "vault setup"; then
    phase vault DONE; return 0
  fi

  if [[ -f "$VAULT_YML" ]] && head -1 "$VAULT_YML" 2>/dev/null | grep -q '^\$ANSIBLE_VAULT'; then
    info "vault.yml already encrypted — verifying password works"
    [[ -f "$VAULT_PASS" ]] || die "vault.yml encrypted but $VAULT_PASS missing — re-create it manually"
  else
    if [[ -f "$VAULT_YML" ]]; then
      warn "vault.yml exists in PLAINTEXT — will autofill remaining <TODO> + encrypt"
    else
      info "rendering vault.yml from example"
      # The file holds plaintext secrets until it is encrypted below ;
      # create it readable by the owner only.
      ( umask 077 && cp "$VAULT_EXAMPLE" "$VAULT_YML" ) \
        || die "could not copy $VAULT_EXAMPLE to $VAULT_YML"
    fi
    _autogen_jwt_keys "$VAULT_YML"
    _autofill_vault_secrets "$VAULT_YML"

    local remaining
    # grep -c exits 1 when the count is 0 ; that is not an error here.
    remaining=$(grep -cE '<TODO' "$VAULT_YML" || true)
    if (( remaining > 0 )); then
      warn "$remaining <TODO> placeholders left (optional fields ; safe to leave or fill later)"
      grep -n '<TODO' "$VAULT_YML" >&2
      local cont
      prompt_value cont "blank these out and continue ? (y/n)" "y"
      if [[ "${cont,,}" == "y" ]]; then
        # Replace any value that still has <TODO with an empty string ;
        # this also covers nested fields under vault_oauth_clients.
        sed -i 's|"<TODO[^"]*"|""|g' "$VAULT_YML"
        ok "remaining placeholders blanked out"
      else
        die "edit $VAULT_YML manually then rerun PHASE=2 ./bootstrap-local.sh"
      fi
    fi
  fi

  if [[ ! -f "$VAULT_PASS" ]]; then
    local pw=""
    prompt_password pw "choose a vault password (memorize it !)"
    [[ -n "$pw" ]] || die "vault password must not be empty"
    # umask in a subshell : the file never exists with wider permissions,
    # not even between creation and chmod. printf instead of echo so a
    # password such as "-n" is written literally.
    ( umask 077 && printf '%s\n' "$pw" > "$VAULT_PASS" ) \
      || die "could not write $VAULT_PASS"
    chmod 0400 "$VAULT_PASS"
    ok "wrote $VAULT_PASS"
  fi

  # Encrypt whenever vault.yml is still plaintext — including when
  # .vault-pass already existed (e.g. a previous run stopped between writing
  # the password and encrypting). Otherwise the file would stay plaintext
  # and the check below would report a misleading password mismatch.
  if ! head -1 "$VAULT_YML" | grep -q '^\$ANSIBLE_VAULT'; then
    info "encrypting vault.yml"
    ansible-vault encrypt --vault-password-file "$VAULT_PASS" "$VAULT_YML" \
      || die "ansible-vault encrypt failed for $VAULT_YML"
    ok "encrypted"
  fi

  info "verifying we can decrypt"
  if ! ansible-vault view --vault-password-file "$VAULT_PASS" "$VAULT_YML" >/dev/null 2>&1; then
    TALAS_HINT="if you remember the password, edit $VAULT_PASS to match. Otherwise run scripts/bootstrap/reset-vault.sh to start over."
    die "cannot decrypt $VAULT_YML with $VAULT_PASS — password mismatch"
  fi
  ok "vault decryption verified"

  mark_done vault
  phase vault DONE
}
# ============================================================================
# Phase 3 — Forgejo Secrets + Variables
# ============================================================================
# Phase 3 — provision the Forgejo repo with what the deploy workflow needs :
#   secret   FORGEJO_REGISTRY_TOKEN  (kept if already set, unless
#                                     FORCE_FORGEJO_REPROMPT=1)
#   secret   ANSIBLE_VAULT_PASSWORD  (always re-set from .vault-pass)
#   variable FORGEJO_REGISTRY_URL
#
# Globals : reads FORGEJO_API_URL, FORGEJO_OWNER, FORGEJO_REPO,
#           FORGEJO_ADMIN_TOKEN, FORGEJO_INSECURE, FORGEJO_REGISTRY_TOKEN,
#           FORCE_FORGEJO_REPROMPT, VAULT_PASS ;
#           sets _current_phase and, before a die, TALAS_HINT.
phase_3_forgejo() {
  section "Phase 3 — Forgejo Secrets + Variables"
  _current_phase=forgejo
  phase forgejo START
  if skip_if_done forgejo "Forgejo provisioning"; then
    phase forgejo DONE; return 0
  fi
  require_env FORGEJO_ADMIN_TOKEN \
    "create at $FORGEJO_API_URL/-/user/settings/applications (scopes: write:repository + write:package, optionally write:admin to auto-create registry tokens)"

  # Fail before touching Forgejo when the vault password is unavailable
  # (e.g. PHASE=3 started without phase 2) : an unreadable or empty file
  # would otherwise be uploaded as an empty ANSIBLE_VAULT_PASSWORD secret.
  if [[ ! -s "$VAULT_PASS" ]]; then
    TALAS_HINT="run PHASE=2 ./bootstrap-local.sh first to create $VAULT_PASS"
    die "$VAULT_PASS is missing or empty"
  fi

  local insecure=()
  [[ "${FORGEJO_INSECURE:-0}" == "1" ]] && insecure=(-k)

  info "checking Forgejo API reachability (no-auth /version probe)"
  if ! curl -fsSL "${insecure[@]}" --max-time 10 \
    "$FORGEJO_API_URL/api/v1/version" >/dev/null 2>&1; then
    TALAS_HINT="check FORGEJO_API_URL ($FORGEJO_API_URL) ; for self-signed certs set FORGEJO_INSECURE=1 in .env ; verify WireGuard if URL is on the LAN"
    die "Forgejo API unreachable"
  fi
  ok "Forgejo API reachable"

  info "checking repo $FORGEJO_OWNER/$FORGEJO_REPO + token has write access"
  if ! forgejo_api GET "/repos/$FORGEJO_OWNER/$FORGEJO_REPO" >/dev/null 2>&1; then
    TALAS_HINT="verify FORGEJO_OWNER + FORGEJO_REPO (currently $FORGEJO_OWNER/$FORGEJO_REPO) ; verify token scope includes read:repository"
    die "repo $FORGEJO_OWNER/$FORGEJO_REPO not found or token lacks read:repository"
  fi
  ok "repo + token OK"

  # FORGEJO_REGISTRY_TOKEN — set once, then leave alone. Re-runs of
  # phase 3 don't re-prompt unless the secret has been deleted in
  # Forgejo UI, OR the operator sets FORCE_FORGEJO_REPROMPT=1.
  # NB: Forgejo doesn't expose GET /actions/secrets/<name> — we list
  # all secrets and grep by name.
  local _secret_exists=0
  if forgejo_api GET "/repos/$FORGEJO_OWNER/$FORGEJO_REPO/actions/secrets" 2>/dev/null \
    | jq -e '.[]? | select(.name == "FORGEJO_REGISTRY_TOKEN")' >/dev/null 2>&1; then
    _secret_exists=1
  fi

  if [[ "${FORCE_FORGEJO_REPROMPT:-0}" != "1" ]] && (( _secret_exists == 1 )); then
    ok "secret FORGEJO_REGISTRY_TOKEN already set (set FORCE_FORGEJO_REPROMPT=1 to replace)"
  else
    local registry_token=""
    if [[ -n "${FORGEJO_REGISTRY_TOKEN:-}" ]]; then
      info "using FORGEJO_REGISTRY_TOKEN from environment"
      registry_token="$FORGEJO_REGISTRY_TOKEN"
    else
      info "trying to auto-create a registry token (needs write:admin scope on admin token)"
      local resp
      # Best effort : any failure here falls through to the manual prompt.
      resp=$(forgejo_api POST "/users/$FORGEJO_OWNER/tokens" \
        --data "$(jq -nc --arg n "veza-deploy-registry-$(date +%s)" \
          --argjson s '["write:package", "read:package"]' \
          '{name: $n, scopes: $s}')" 2>/dev/null \
        || true)
      registry_token=$(echo "$resp" | jq -r '.sha1 // empty' 2>/dev/null || true)
      if [[ -z "$registry_token" ]]; then
        warn "auto-create failed (admin token lacks write:admin or sudo)"
        warn "create the token manually :"
        warn " $FORGEJO_API_URL/-/user/settings/applications"
        warn " → 'Generate New Token' → name 'veza-deploy-registry'"
        warn " → scopes: write:package, read:package"
        prompt_password registry_token "paste the token value (input hidden)"
      else
        ok "auto-created registry token (${#registry_token} chars)"
      fi
    fi
    # Never overwrite the secret with an empty value (blank paste).
    [[ -n "$registry_token" ]] || die "no registry token provided — FORGEJO_REGISTRY_TOKEN left unchanged"
    forgejo_set_secret "$FORGEJO_OWNER" "$FORGEJO_REPO" FORGEJO_REGISTRY_TOKEN "$registry_token"
  fi

  # Vault password is always re-set from the current .vault-pass — cheap,
  # idempotent, and survives a re-run after rotation.
  local vault_password
  vault_password=$(<"$VAULT_PASS") || die "cannot read $VAULT_PASS"
  forgejo_set_secret "$FORGEJO_OWNER" "$FORGEJO_REPO" ANSIBLE_VAULT_PASSWORD "$vault_password"
  forgejo_set_var "$FORGEJO_OWNER" "$FORGEJO_REPO" FORGEJO_REGISTRY_URL \
    "$FORGEJO_API_URL/api/packages/$FORGEJO_OWNER/generic"

  mark_done forgejo
  phase forgejo DONE
}
# ============================================================================
# Phase 4 — R720 remote bootstrap
# ============================================================================
# Phase 4 — upload lib.sh + bootstrap-remote.sh to the R720 and run
# bootstrap-remote.sh there under sudo, handing it a fresh Forgejo runner
# registration token.
#
# Globals : reads SCRIPT_DIR, R720_USER, R720_HOST, FORGEJO_OWNER,
#           FORGEJO_REPO, FORGEJO_API_URL, FORGEJO_ADMIN_TOKEN ;
#           sets _current_phase and, before a die, TALAS_HINT.
phase_4_r720() {
  section "Phase 4 — R720 remote bootstrap (Incus profiles + runner labels)"
  _current_phase=r720
  phase r720 START
  if skip_if_done r720 "R720 remote bootstrap"; then
    phase r720 DONE; return 0
  fi
  require_env FORGEJO_ADMIN_TOKEN

  info "fetching a runner registration token from Forgejo"
  local reg_token
  reg_token=$(forgejo_get_runner_token "$FORGEJO_OWNER" "$FORGEJO_REPO") \
    || die "could not fetch runner registration token"
  [[ -n "$reg_token" ]] || die "could not fetch runner registration token"
  info "got registration token (${#reg_token} chars)"

  local remote_script="$SCRIPT_DIR/bootstrap-remote.sh"
  local remote_lib="$SCRIPT_DIR/lib.sh"
  require_file "$remote_script"
  require_file "$remote_lib"

  # SSH target string handles both "user@host" and pure "host" (when
  # the alias's User= line is the source of truth).
  local ssh_target
  if [[ -n "${R720_USER:-}" ]]; then
    ssh_target="${R720_USER}@${R720_HOST}"
  else
    ssh_target="${R720_HOST}"
  fi

  local remote_dir=/tmp/talas-bootstrap
  info "uploading lib.sh + bootstrap-remote.sh to $ssh_target:$remote_dir/"
  # The scripts are executed as root below, so never reuse a directory that
  # may have been prepared by someone else : remove any leftover and create
  # it fresh with mode 0700. If the path exists and cannot be removed (not
  # ours), rm fails and we stop here.
  ssh "$ssh_target" "rm -rf $remote_dir && mkdir -m 0700 $remote_dir" \
    || die "ssh mkdir failed (target: $ssh_target)"
  scp -q "$remote_lib" "$remote_script" "$ssh_target:$remote_dir/" \
    || die "scp failed (target: $ssh_target)"
  ok "uploaded"

  info "running bootstrap-remote.sh over ssh -t (TTY for sudo prompt)"
  info " → if sudo asks for a password, type it once at the prompt below"
  # ssh -t allocates a TTY so sudo can prompt for the password. Set vars
  # via env=… so they're available inside the sudo'd script (sudo -E
  # only preserves explicit pre-existing env vars, not ones set on the
  # ssh command line). The /var/log/talas-bootstrap.log on R720 keeps
  # a copy of the output even if the SSH stream gets cut.
  # %q shell-quotes each value, so a quote or space in the token or URL
  # cannot break, or inject into, the command parsed by the remote shell.
  local remote_cmd
  printf -v remote_cmd 'sudo env FORGEJO_REGISTRATION_TOKEN=%q FORGEJO_API_URL=%q bash %q' \
    "$reg_token" "$FORGEJO_API_URL" "$remote_dir/bootstrap-remote.sh"

  local rc=0
  ssh -t "$ssh_target" "$remote_cmd" || rc=$?

  # Cleanup uploaded scripts, whether the run succeeded or not.
  ssh "$ssh_target" "rm -rf $remote_dir" || true

  if (( rc != 0 )); then
    # NOTE(review): the command run through sudo is `env`, so a NOPASSWD
    # rule limited to /usr/bin/bash may not match — confirm before relying
    # on this hint.
    TALAS_HINT="ssh to $ssh_target and tail /var/log/talas-bootstrap.log ; or set up passwordless sudo : echo '$R720_USER ALL=(ALL) NOPASSWD: /usr/bin/bash' | sudo tee /etc/sudoers.d/talas-bootstrap"
    die "remote bootstrap failed"
  fi

  mark_done r720
  phase r720 DONE
}
# ============================================================================
# Phase 5 — Edge HAProxy + Let's Encrypt
# ============================================================================
# Phase 5 — run playbooks/haproxy.yml against the staging inventory and
# check that certificate files showed up in the veza-haproxy container.
#
# Steps : install missing ansible collections, probe for passwordless sudo
# on the R720 (adds --ask-become-pass when it is not available), run the
# playbook, then list /usr/local/etc/tls/haproxy/ inside veza-haproxy.
# Returns 1 (phase not marked done) when no certificate file is found.
# Side effect : changes the working directory to $REPO_ROOT/infra/ansible.
# Globals : reads REPO_ROOT, R720_USER, R720_HOST ;
#           sets _current_phase and, before a die, TALAS_HINT.
phase_5_haproxy() {
  section "Phase 5 — Edge HAProxy + Let's Encrypt certs"
  _current_phase=haproxy
  phase haproxy START
  if skip_if_done haproxy "haproxy + LE"; then
    phase haproxy DONE; return 0
  fi

  # The inventory, playbook and .vault-pass paths below are relative.
  cd "$REPO_ROOT/infra/ansible" || die "cannot cd to $REPO_ROOT/infra/ansible"

  # Ansible collections needed by the haproxy/deploy playbooks.
  # ansible.cfg sets stdout_callback=yaml which lives in
  # community.general — without it, ansible-playbook errors out
  # immediately ("Invalid callback for stdout specified: yaml").
  info "ensuring ansible collections (community.general / .postgresql / .rabbitmq) are installed"
  local col
  for col in community.general community.postgresql community.rabbitmq; do
    if ! ansible-galaxy collection list "$col" 2>/dev/null | grep -q "^$col"; then
      info "installing $col"
      ansible-galaxy collection install "$col" >/dev/null \
        || die "ansible-galaxy collection install $col failed (network ? ~/.ansible/ writable ?)"
    fi
  done
  ok "collections present"

  # Compute SSH target the same way phase 4 does.
  local ssh_target
  if [[ -n "${R720_USER:-}" ]]; then
    ssh_target="${R720_USER}@${R720_HOST}"
  else
    ssh_target="${R720_HOST}"
  fi

  # Detect if NOPASSWD sudo is configured ; if not, pass --ask-become-pass.
  local become_flag=()
  if ssh "$ssh_target" "sudo -n /bin/true" >/dev/null 2>&1; then
    ok "passwordless sudo on R720 — running ansible without -K"
  else
    info "sudo on R720 needs a password — passing --ask-become-pass"
    info " → ansible will prompt 'BECOME password:' below ; type your sudo password"
    become_flag=(--ask-become-pass)
  fi

  info "running ansible-playbook playbooks/haproxy.yml (5-10 min)"
  if ! ansible-playbook -i inventory/staging.yml playbooks/haproxy.yml \
    --vault-password-file .vault-pass \
    "${become_flag[@]}"; then
    TALAS_HINT="check the ansible output above ; common issues : Incus profile missing, port 80 blocked from Internet, DNS not yet propagated, sudo password rejected"
    die "ansible-playbook haproxy.yml failed"
  fi

  info "verifying Let's Encrypt certs landed"
  local certs
  # Same target as the sudo probe above ; an ssh / incus failure simply
  # yields an empty listing, which is reported below.
  certs=$(ssh "$ssh_target" "incus exec veza-haproxy -- ls /usr/local/etc/tls/haproxy/ 2>/dev/null" || true)
  if [[ -z "$certs" ]]; then
    warn "no certs found in /usr/local/etc/tls/haproxy/ on veza-haproxy"
    warn "check /var/log/letsencrypt or run again — dehydrated retries on next playbook run"
    return 1
  fi
  ok "certs : $(echo "$certs" | tr '\n' ' ')"

  mark_done haproxy
  phase haproxy DONE
}
# ============================================================================
# Phase 6 — Summary
# ============================================================================
# Phase 6 — print the final readiness report on stderr and mark the
# summary phase as done.
#
# Globals : reads _GREEN, _BOLD, _RESET, FORGEJO_API_URL, FORGEJO_OWNER,
#           FORGEJO_REPO, SCRIPT_DIR, R720_USER, R720_HOST,
#           TALAS_STATE_FILE ; sets _current_phase.
phase_6_summary() {
  section "Phase 6 — Summary"
  _current_phase=summary
  phase summary START

  # Values interpolated into the report below.
  local actions_url="$FORGEJO_API_URL/$FORGEJO_OWNER/$FORGEJO_REPO/actions"
  local remote="$R720_USER@$R720_HOST"

  cat >&2 <<EOF
${_GREEN}${_BOLD}✓ Bootstrap complete.${_RESET}
What works now :
• Forgejo registry has the deploy secrets + variable.
• forgejo-runner has the 'incus' label and Incus socket access.
• veza-haproxy edge container is up with Let's Encrypt certs.
What you can do next :
1. Trigger a manual deploy via Forgejo Actions UI :
$actions_url
→ "Veza deploy" → "Run workflow" → env=staging.
2. Once that run is green, re-enable auto-trigger :
$SCRIPT_DIR/enable-auto-deploy.sh
3. Verify state any time :
$SCRIPT_DIR/verify-local.sh
ssh $remote $SCRIPT_DIR/verify-remote.sh
State file : $TALAS_STATE_FILE
EOF

  mark_done summary
  phase summary DONE
}
# ============================================================================
# main
# ============================================================================
# Run the bootstrap phases in order, starting at phase $PHASE (default 1).
#
# PHASE must be an integer from 1 to 6. Anything else is rejected up front :
# a non-numeric value would be evaluated as an arithmetic expression and
# silently restart from phase 1, and a value above 6 would report
# "ALL DONE" without running anything.
# Globals : reads PHASE.
main() {
  local start=${PHASE:-1}
  if [[ ! "$start" =~ ^[1-6]$ ]]; then
    die "PHASE must be an integer between 1 and 6 (got '$start')"
  fi
  info "starting at phase $start"

  # Index i of this list is phase i+1.
  local phases=(
    phase_1_preflight
    phase_2_vault
    phase_3_forgejo
    phase_4_r720
    phase_5_haproxy
    phase_6_summary
  )
  local i
  for (( i = start; i <= ${#phases[@]}; i++ )); do
    "${phases[i - 1]}"
  done

  ok "ALL DONE"
}
# Entry point : run main with the script's command-line arguments (main
# itself only reads the PHASE environment variable).
main "$@"