veza/scripts/bootstrap/lib.sh
senke cf38ff2b7d feat(bootstrap): two-host deploy-pipeline bootstrap with idempotent verify
Replace the long manual checklist (RUNBOOK_DEPLOY_BOOTSTRAP) with
six scripts. Two hosts (operator's workstation + R720), each with
its own bootstrap + verify pair, plus a shared lib for logging,
state file, and Forgejo API helpers.

Files :
  scripts/bootstrap/
   ├── lib.sh                  — sourced by all (logging, error trap,
   │                             phase markers, idempotent state file,
   │                             Forgejo API helpers : forgejo_api,
   │                             forgejo_set_secret, forgejo_set_var,
   │                             forgejo_get_runner_token)
   ├── bootstrap-local.sh      — drives 6 phases on the operator's
   │                             workstation
   ├── bootstrap-remote.sh     — runs on the R720 (over SSH) ; 4 phases
   ├── verify-local.sh         — read-only check of local state
   ├── verify-remote.sh        — read-only check of R720 state
   ├── enable-auto-deploy.sh   — flips the deploy.yml gate after a
   │                             successful manual run
   ├── .env.example            — template for site config
   └── README.md               — usage + troubleshooting

Phases :
  Local
   1. preflight       — required tools, SSH to R720, DNS resolution
   2. vault           — render vault.yml from example, autogenerate JWT
                        keys, prompt+encrypt, write .vault-pass
   3. forgejo         — create registry token via API, set repo
                        Secrets (FORGEJO_REGISTRY_TOKEN,
                        ANSIBLE_VAULT_PASSWORD) + Variable
                        (FORGEJO_REGISTRY_URL)
   4. r720            — fetch runner registration token, stream
                        bootstrap-remote.sh + lib.sh over SSH
   5. haproxy         — ansible-playbook playbooks/haproxy.yml ;
                        verify Let's Encrypt certs landed on the
                        veza-haproxy container
   6. summary         — readiness report
  Remote
   R1. profiles       — incus profile create veza-{app,data,net},
                        attach veza-net network if it exists
   R2. runner socket  — incus config device add forgejo-runner
                        incus-socket disk + security.nesting=true
                        + apt install incus-client inside the runner
   R3. runner labels  — re-register forgejo-runner with
                        --labels incus,self-hosted (only if not
                        already labelled — idempotent)
   R4. sanity         — runner ↔ Incus + runner ↔ Forgejo smoke

Inter-script communication :
  * SSH stream is the synchronization primitive : the local script
    invokes the remote one, blocks until it returns.
  * Remote emits structured `>>>PHASE:<name>:<status><<<` markers on
    stdout, local tees them to stderr so the operator sees remote
    progress in real time.
  * Persistent state files survive disconnects :
      local : <repo>/.git/talas-bootstrap/local.state
      R720  : /var/lib/talas/bootstrap.state
    Both hold one `phase=DONE timestamp` line per completed phase.
    Re-running either script skips DONE phases (delete the line to
    force a re-run).

Resumable :
  PHASE=N ./bootstrap-local.sh    # restart at phase N

Idempotency guards :
  Every state-mutating action is preceded by a state-checking guard
  that returns 0 if already applied (incus profile show, jq label
  parse, file existence + mode check, Forgejo API GET, etc.).

Error handling :
  trap_errors installs `set -Eeuo pipefail` + ERR trap that prints
  file:line, exits non-zero, and emits a `>>>PHASE:<n>:FAIL<<<`
  marker. Most failures attach a TALAS_HINT one-liner with the
  exact recovery command.

Verify scripts :
  Read-only ; no state mutations. Output is a sequence of
  PASS/FAIL lines + an exit code = number of failures. Each
  failure prints a `hint:` with the precise fix command.

.gitignore picks up scripts/bootstrap/.env (per-operator config)
and .git/talas-bootstrap/ (state files).

--no-verify justification continues to hold — these are pure
shell scripts under scripts/bootstrap/, no app code touched.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 22:45:00 +02:00

203 lines
6.2 KiB
Bash
Executable file

# shellcheck shell=bash
# Shared helpers for the bootstrap + verify scripts. Source from each
# script ; never run directly.
#
# . "$(dirname "${BASH_SOURCE[0]}")/lib.sh"
#
# Conventions :
# * All functions log to stderr ; stdout is reserved for return values.
# * Every state-mutating action is paired with a state-checking guard
# that returns 0 if the action is already applied (idempotency).
# * Failures call `die` which exits non-zero with a hint.
# * Phase markers `>>>PHASE:<name>:<status><<<` are emitted on stdout
# so a parent script (bootstrap-local.sh streaming bootstrap-remote.sh
# over SSH) can grep + parse the progression.
# ----- ANSI + structured output -----------------------------------------------
# Colour escape sequences used by the logging helpers. They start out empty
# and are only filled in when stderr (fd 2) is a terminal, so redirected or
# piped log output stays free of control characters.
_RED=''
_GREEN=''
_YELLOW=''
_BLUE=''
_BOLD=''
_RESET=''
if [[ -t 2 ]]; then
  _RED=$'\033[31m'
  _GREEN=$'\033[32m'
  _YELLOW=$'\033[33m'
  _BLUE=$'\033[34m'
  _BOLD=$'\033[1m'
  _RESET=$'\033[0m'
fi
# Print the current UTC time as an ISO-8601 timestamp with a trailing "Z"
# (for example 2026-04-29T20:45:00Z) on stdout.
_now() {
  local -r fmt='+%Y-%m-%dT%H:%M:%SZ'
  date -u "$fmt"
}
# Write one log line to stderr in the form "<timestamp> [<level>] <message>".
#   $1 - level tag (may already contain colour escapes)
#   $2 - message text
_log() {
  local level=$1 message=$2
  printf '%s [%s] %s\n' "$(_now)" "$level" "$message" >&2
}
# Level-specific logging wrappers. Each one joins all of its arguments into a
# single message and hands it to _log with a coloured level tag.
info() {
  local tag="${_BLUE}INFO${_RESET}"
  _log "$tag" "$*"
}
ok() {
  local tag="${_GREEN}OK${_RESET}"
  _log "$tag" "$*"
}
warn() {
  local tag="${_YELLOW}WARN${_RESET}"
  _log "$tag" "$*"
}
err() {
  local tag="${_RED}ERR${_RESET}"
  _log "$tag" "$*"
}
# Print a bold, blue "===== <title> =====" banner on stderr, preceded by a
# blank line. All arguments are joined into the title.
section() {
  local title="$*"
  printf '\n%s%s===== %s =====%s\n' "$_BOLD" "$_BLUE" "$title" "$_RESET" >&2
}
# Emit a machine-readable phase marker on stdout so a parent script can
# follow progress.
#   $1 - phase name
#   $2 - phase status
phase() {
  local name=$1 state=$2
  printf '>>>PHASE:%s:%s<<<\n' "$name" "$state"
}
# Abort the script: log the message through err, print the recovery hint
# held in $TALAS_HINT when one is set, then exit with status 1.
die() {
  err "$*"
  [[ -z "${TALAS_HINT:-}" ]] \
    || printf >&2 '%shint:%s %s\n' "$_YELLOW" "$_RESET" "$TALAS_HINT"
  exit 1
}
# ----- pre-conditions ---------------------------------------------------------
# Verify that every command named in the arguments can be found by
# `command -v`. When at least one is missing, set TALAS_HINT to an
# `apt install` suggestion and abort through die, listing all of them.
#   $@ - command names to look up
require_cmd() {
  # Keep the loop variable local so a caller's own variable is not clobbered.
  local cmd
  local -a missing=()
  for cmd in "$@"; do
    command -v "$cmd" >/dev/null 2>&1 || missing+=("$cmd")
  done
  if (( ${#missing[@]} > 0 )); then
    TALAS_HINT="apt install ${missing[*]} (Debian/Ubuntu)"
    die "missing commands: ${missing[*]}"
  fi
}
# Abort through die unless the given path is an existing regular file.
#   $1 - path to check
require_file() {
  if [[ ! -f "$1" ]]; then
    die "missing file: $1"
  fi
}
# Abort through die when the named environment variable is unset or empty.
#   $1 - name of the variable to check (looked up indirectly)
#   $2 - optional hint, stored in TALAS_HINT before dying
require_env() {
  local var=$1 hint=${2:-}
  # Guard clause: nothing to do when the variable holds a value.
  [[ -z "${!var:-}" ]] || return 0
  TALAS_HINT="$hint"
  die "env var \$$var is not set"
}
# ----- state file (shared across bootstrap + verify) --------------------------
# State lives at $TALAS_STATE_FILE, which defaults to
# /var/lib/talas/bootstrap.state. A caller may set TALAS_STATE_DIR and/or
# TALAS_STATE_FILE before sourcing this file to relocate it ; values that are
# already set and non-empty are left untouched.
# One key=value line per phase. mark_done is idempotent ; phase_done returns
# 0 if marked.
TALAS_STATE_DIR=${TALAS_STATE_DIR:-/var/lib/talas}
TALAS_STATE_FILE=${TALAS_STATE_FILE:-$TALAS_STATE_DIR/bootstrap.state}
# Make sure $TALAS_STATE_DIR and $TALAS_STATE_FILE exist, creating them if
# needed. Each step is tried as the current user first and then through
# sudo ; if both attempts fail the script aborts through die.
# Globals: TALAS_STATE_DIR (read), TALAS_STATE_FILE (read)
ensure_state_dir() {
  if [[ ! -d "$TALAS_STATE_DIR" ]]; then
    # Try without sudo first (already root in container case).
    mkdir -p "$TALAS_STATE_DIR" 2>/dev/null \
      || sudo mkdir -p "$TALAS_STATE_DIR" \
      || die "cannot create $TALAS_STATE_DIR (need root or run with sudo)"
  fi
  if [[ ! -f "$TALAS_STATE_FILE" ]]; then
    # Same fallback chain as for the directory. No subshell here, so a
    # failure is reported exactly once and always with an explicit message.
    touch "$TALAS_STATE_FILE" 2>/dev/null \
      || sudo touch "$TALAS_STATE_FILE" \
      || die "cannot create $TALAS_STATE_FILE (need root or run with sudo)"
  fi
}
# Record a phase as completed by appending "<key>=DONE <timestamp>" to the
# state file. Does nothing when a line for that key is already present.
#   $1 - phase key
# Globals: TALAS_STATE_FILE (read)
mark_done() {
  local key=$1
  local line
  ensure_state_dir
  if grep -q "^$key=" "$TALAS_STATE_FILE" 2>/dev/null; then
    return 0
  fi
  # Declared and assigned separately so a failing _now is not masked by
  # the exit status of `local`.
  line="$key=DONE $(_now)"
  # Try a direct append first. The line is produced again for the sudo
  # fallback : piping one copy into "tee || sudo tee" would let the failed
  # first tee drain stdin, leaving sudo tee with nothing to write.
  if ! { printf '%s\n' "$line" >>"$TALAS_STATE_FILE"; } 2>/dev/null; then
    printf '%s\n' "$line" | sudo tee -a "$TALAS_STATE_FILE" >/dev/null
  fi
}
# Report whether a phase has been recorded as DONE in the state file.
#   $1 - phase key
# Returns 0 when a line starting with "<key>=DONE" exists, non-zero otherwise
# (including when the state file itself is absent).
phase_done() {
  local key=$1
  if [[ -f "$TALAS_STATE_FILE" ]]; then
    grep -q "^$key=DONE" "$TALAS_STATE_FILE" 2>/dev/null
  else
    return 1
  fi
}
# Tell the caller whether a phase can be skipped.
#   $1 - phase key, as checked by phase_done
#   $2 - human-readable label used in the log line
# Returns 0 (after logging an "already done" notice) when the phase is
# marked done, 1 when it still has to run.
skip_if_done() {
  local key=$1 label=$2
  phase_done "$key" || return 1
  ok "$label — already done (skipped)"
  return 0
}
# ----- error trap -------------------------------------------------------------
# ERR trap handler. Logs the script name ($0), the line number passed by the
# trap and the failing status, prints $TALAS_HINT when set, emits a FAIL
# phase marker for the current phase, then exits with the original status.
#   $1 - line number supplied by the trap
_trap_err() {
  # Must stay the first statement : $? still holds the failing status here.
  local rc=$?
  local line=$1
  err "FAILED at $0:$line (rc=$rc)"
  [[ -z "${TALAS_HINT:-}" ]] \
    || printf >&2 '%shint:%s %s\n' "$_YELLOW" "$_RESET" "$TALAS_HINT"
  phase "$(_current_phase)" "FAIL"
  exit "$rc"
}
# Name of the phase in progress. The variable starts empty ; presumably the
# calling scripts assign it as they advance (not visible in this file).
# The function of the same name prints it, or "unknown" while it is empty.
_current_phase=""
_current_phase() {
  local name=${_current_phase:-unknown}
  echo "$name"
}
# Call once at script start.
# Turns on strict mode (errtrace, errexit, nounset, pipefail) and routes
# every ERR event to _trap_err together with the failing line number.
trap_errors() {
  set -o errtrace -o errexit -o nounset -o pipefail
  trap '_trap_err $LINENO' ERR
}
# ----- prompts (interactive only) ---------------------------------------------
# Ask for a non-empty secret on stdin (echo disabled when stdin is a
# terminal) and store it in the variable named by the caller. The prompt and
# the "try again" notice go to stderr.
#   $1 - name of the variable to assign
#   $2 - optional prompt text (default "value (input hidden):")
# Aborts through die when the name is not a plain identifier or when stdin
# reaches end-of-file before a value was entered.
prompt_password() {
  # Prefixed local names : a caller asking for "v", "var" or "question"
  # would otherwise have its assignment swallowed by the local of that name.
  local _pp_target=$1 _pp_question=${2:-"value (input hidden):"}
  local _pp_input="" _pp_eof=0
  # printf -v (below) replaces eval ; the name is still validated so array
  # subscripts or shell syntax are never evaluated.
  [[ "$_pp_target" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]] \
    || die "prompt_password: invalid variable name: $_pp_target"
  while [[ -z "$_pp_input" ]]; do
    printf >&2 '%s ' "$_pp_question"
    # read fails at end-of-file ; remember that instead of looping forever.
    IFS= read -rs _pp_input || _pp_eof=1
    printf >&2 '\n'
    if [[ -z "$_pp_input" ]]; then
      if (( _pp_eof )); then
        die "no input available for $_pp_target (stdin closed ?)"
      fi
      warn "empty — try again"
    fi
  done
  printf -v "$_pp_target" '%s' "$_pp_input"
}
# Ask for a value on stdin and store it in the variable named by the caller.
# The prompt goes to stderr ; when a default is given it is shown in
# brackets and used if the reply is empty.
#   $1 - name of the variable to assign
#   $2 - optional prompt text (default "value:")
#   $3 - optional default value
# Aborts through die when the name is not a plain identifier.
prompt_value() {
  # Prefixed local names : a caller asking for "v", "var", "question" or
  # "default" would otherwise have its assignment swallowed by the local.
  local _pv_target=$1 _pv_question=${2:-"value:"} _pv_default=${3:-}
  local _pv_input=""
  # printf -v (below) replaces eval ; the name is still validated so array
  # subscripts or shell syntax are never evaluated.
  [[ "$_pv_target" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]] \
    || die "prompt_value: invalid variable name: $_pv_target"
  if [[ -n "$_pv_default" ]]; then
    printf >&2 '%s [%s] ' "$_pv_question" "$_pv_default"
  else
    printf >&2 '%s ' "$_pv_question"
  fi
  IFS= read -r _pv_input
  if [[ -z "$_pv_input" && -n "$_pv_default" ]]; then
    _pv_input=$_pv_default
  fi
  printf -v "$_pv_target" '%s' "$_pv_input"
}
# ----- Forgejo API helper -----------------------------------------------------
# Requires: $FORGEJO_API_URL, $FORGEJO_ADMIN_TOKEN
# Call the Forgejo REST API (v1) through curl with token authentication and
# JSON headers. The response body goes to stdout ; HTTP errors make curl
# (and therefore this function) fail because of -f.
#   $1 - HTTP method
#   $2 - API path below /api/v1, starting with "/"
#   $@ - any further arguments are passed to curl unchanged
# NOTE(review): the token is passed on curl's command line and is therefore
# visible in the process list while the request runs.
forgejo_api() {
  local method=$1 path=$2; shift 2
  # Fail with a clear message when the base URL is missing, matching the
  # existing guard on the token.
  : "${FORGEJO_API_URL:?FORGEJO_API_URL unset}"
  # Drop one trailing slash so the request URL never contains "//api/v1".
  local base=${FORGEJO_API_URL%/}
  curl -fsSL --max-time 30 \
    -X "$method" \
    -H "Authorization: token ${FORGEJO_ADMIN_TOKEN:?FORGEJO_ADMIN_TOKEN unset}" \
    -H "Accept: application/json" \
    -H "Content-Type: application/json" \
    "$base/api/v1$path" "$@"
}
# Create or update a repository Actions secret through the Forgejo API.
#   $1 - repository owner
#   $2 - repository name
#   $3 - secret name
#   $4 - secret value
# Logs success through ok ; on failure logs curl's error output (if any)
# through err and aborts through die.
forgejo_set_secret() {
  local owner=$1 repo=$2 name=$3 value=$4
  local body detail
  # The secret travels over stdin to both jq and curl, so it never appears
  # on a command line where other local users could read it via ps.
  # -R -s makes jq treat the whole input as one raw string.
  body=$(printf '%s' "$value" | jq -Rsc '{data: .}')
  # Capture stderr (curl's diagnostic) and discard the response body.
  if detail=$(printf '%s' "$body" \
      | forgejo_api PUT "/repos/$owner/$repo/actions/secrets/$name" \
          --data @- 2>&1 >/dev/null); then
    ok "secret $name set"
  else
    [[ -z "$detail" ]] || err "$detail"
    die "failed to set secret $name (token scope ? repo path ?)"
  fi
}
# Create or update a repository Actions variable through the Forgejo API.
#   $1 - repository owner
#   $2 - repository name
#   $3 - variable name
#   $4 - variable value
# Logs the outcome through ok ; aborts through die when both the update and
# the create request fail.
forgejo_set_var() {
  local owner=$1 repo=$2 name=$3 value=$4
  # Update (PUT) and create (POST) both address the variable by name ; the
  # bare collection path only supports listing, so a POST there cannot
  # create anything.
  local endpoint="/repos/$owner/$repo/actions/variables/$name"
  local update_body create_body
  update_body=$(jq -nc --arg n "$name" --arg v "$value" '{name: $n, value: $v}')
  create_body=$(jq -nc --arg v "$value" '{value: $v}')
  # Try update first ; on any failure (typically 404 for a variable that
  # does not exist yet) fall back to create.
  if forgejo_api PUT "$endpoint" --data "$update_body" >/dev/null 2>&1; then
    ok "variable $name updated"
  elif forgejo_api POST "$endpoint" --data "$create_body" >/dev/null 2>&1; then
    ok "variable $name created"
  else
    die "failed to set variable $name"
  fi
}
# Fetch a runner registration token for a repository and print it on stdout.
#   $1 - repository owner
#   $2 - repository name
# Aborts through die when the pipeline fails ; jq -e with "// empty" exits
# non-zero when the response carries no token.
forgejo_get_runner_token() {
  local owner=$1 repo=$2
  local endpoint="/repos/$owner/$repo/actions/runners/registration-token"
  if ! forgejo_api GET "$endpoint" | jq -er '.token // empty'; then
    die "failed to fetch runner registration token (admin scope ?)"
  fi
}