diff --git a/scripts/bootstrap/.env.example b/scripts/bootstrap/.env.example index d61b8792f..f2f616e9b 100644 --- a/scripts/bootstrap/.env.example +++ b/scripts/bootstrap/.env.example @@ -2,18 +2,36 @@ # pick it up automatically. # # cp .env.example .env -# $EDITOR .env +# vim .env # NB: $EDITOR is unset by default in many shells +# ↑ use the editor name directly -R720_HOST=10.0.20.150 -R720_USER=ansible +# ---- R720 SSH target --------------------------------------------------------- +# If you use an SSH config Host alias (e.g. `srv-102v` in ~/.ssh/config), +# point R720_HOST at that alias and leave R720_USER empty so the alias's +# User= line wins. +R720_HOST=srv-102v +R720_USER=senke -FORGEJO_API_URL=https://forgejo.talas.group -FORGEJO_OWNER=talas +# ---- Forgejo API (for secret + variable provisioning) ------------------------ +# First-run, before HAProxy + LE certs are up : use the LAN IP on port 3000 +# directly. Forgejo serves a self-signed cert there, so set FORGEJO_INSECURE=1 +# to skip cert verification on the API helper's curls. +FORGEJO_API_URL=https://10.0.20.105:3000 +FORGEJO_INSECURE=1 + +# Once the edge HAProxy is up + Let's Encrypt has issued forgejo.talas.group : +# FORGEJO_API_URL=https://forgejo.talas.group +# FORGEJO_INSECURE=0 + +# Owner = the path segment between forgejo.talas.group/ and /veza in the URL +# of your repo. Run `git remote -v` to confirm — usually `senke` (user) or +# `talas` (org). 
+FORGEJO_OWNER=senke FORGEJO_REPO=veza # Forgejo personal access token with scopes : -# write:admin (for runner registration token) -# write:repository (for repo secrets/variables) -# write:package (for the registry token created on the fly) +# write:admin — for runner registration token +# write:repository — for repo secrets/variables +# write:package — for the registry token created on the fly # Generate at $FORGEJO_API_URL/-/user/settings/applications FORGEJO_ADMIN_TOKEN= diff --git a/scripts/bootstrap/README.md b/scripts/bootstrap/README.md index 10ebbcf21..2d8ed359f 100644 --- a/scripts/bootstrap/README.md +++ b/scripts/bootstrap/README.md @@ -8,12 +8,14 @@ asked to mutate. | File | Where it runs | What it does | |---|---|---| -| `lib.sh` | sourced by both | logging, error trap, idempotent state file, Forgejo API helpers | +| `lib.sh` | sourced by all | logging, error trap, idempotent state file, Forgejo API helpers (honours `FORGEJO_INSECURE=1`) | | `bootstrap-local.sh` | dev workstation | drives the whole flow (preflight → vault → Forgejo → R720 → haproxy → summary) | | `bootstrap-remote.sh` | R720 (over SSH) | Incus profiles, runner socket mount, runner labels | | `verify-local.sh` | dev workstation | read-only checks of local state | -| `verify-remote.sh` | R720 | read-only checks of R720 state | -| `enable-auto-deploy.sh` | dev workstation | flips the deploy.yml gate from workflow_dispatch-only to push:main + tag:v* | +| `verify-remote.sh` | R720 | read-only checks of R720 state (run via `verify-remote-ssh.sh`) | +| `verify-remote-ssh.sh` | dev workstation | scp+ssh wrapper that runs `verify-remote.sh` on R720 | +| `enable-auto-deploy.sh` | dev workstation | restores `.forgejo/workflows/` if disabled, uncomments push: trigger | +| `reset-vault.sh` | dev workstation | recovery from a vault password mismatch (destructive — re-prompts) | | `.env.example` | template | copy to `.env`, fill in, gitignored | ## State file @@ -78,20 +80,43 @@ ssh 
ansible@10.0.20.150 'sudo bash' < verify-remote.sh ## Troubleshooting +- **Phase 1 SSH fails** — verify `R720_HOST` + `R720_USER` in `.env`. + If you use an SSH config alias (e.g. `Host srv-102v` in + `~/.ssh/config`), set `R720_HOST=srv-102v` and either set + `R720_USER=` (empty, alias's User= wins) or match the alias's user. + Test manually : `ssh ${R720_USER:+$R720_USER@}${R720_HOST} /bin/true`. +- **Phase 2 `cannot decrypt vault.yml`** — the password in + `.vault-pass` doesn't match what was used to encrypt `vault.yml`. + - If you remember the original password, edit `.vault-pass` + (`echo "<password>" > infra/ansible/.vault-pass ; chmod 0400 …`). + - Otherwise : `./reset-vault.sh` — destructive, re-prompts for + everything. +- **Phase 3 `Forgejo API unreachable`** — Forgejo on + `https://10.0.20.105:3000` serves a self-signed cert. Set + `FORGEJO_INSECURE=1` in `.env`. Once the edge HAProxy is up + LE has + issued `forgejo.talas.group`, switch to that URL and clear + `FORGEJO_INSECURE`. - **Phase 3 `repo not found`** — set `FORGEJO_OWNER` to the actual - org/user owning the repo (e.g., `senke` instead of `talas`). -- **Phase 4 SSH timeout** — `sudo` may prompt for password ; configure - passwordless sudo for the SSH user, OR run remote bootstrap manually : + org/user owning the repo. Confirm with `git remote -v` (the path + segment after `host:port/`). +- **Phase 4 SSH timeout / sudo prompt** — passwordless sudo needed + for the SSH user. Add to `/etc/sudoers.d/talas-bootstrap` : ``` - scp scripts/bootstrap/{lib.sh,bootstrap-remote.sh} r720:/tmp/ - ssh r720 'sudo FORGEJO_REGISTRATION_TOKEN=… bash /tmp/bootstrap-remote.sh' + senke ALL=(ALL) NOPASSWD: /usr/bin/bash ``` -- **Phase 5 dehydrated fails** — check that port 80 reaches the R720 - from Internet (not blocked by ISP, NAT-forwarded, etc.). dehydrated - needs HTTP-01 inbound. 
Test: from outside, - `curl http://veza.fr/.well-known/acme-challenge/test` should hit - HAProxy's letsencrypt_backend (will 404, which is fine ; what - matters is it reaches the R720). + Or run the remote half manually : + ``` + scp scripts/bootstrap/{lib.sh,bootstrap-remote.sh} srv-102v:/tmp/ + ssh srv-102v 'sudo FORGEJO_REGISTRATION_TOKEN= bash /tmp/bootstrap-remote.sh' + ``` +- **Phase 5 dehydrated fails** — port 80 must be reachable from + Internet for HTTP-01 (not blocked by ISP, NAT-forwarded). Test + from outside : `curl http://veza.fr/.well-known/acme-challenge/test` + should hit HAProxy's `letsencrypt_backend` (will 404, which is + fine ; what matters is reaching the R720). +- **`.forgejo/workflows/` is missing, only `workflows.disabled/` present** — + expected when the auto-trigger has been gated by renaming the dir. + `enable-auto-deploy.sh` restores it. ## After bootstrap diff --git a/scripts/bootstrap/bootstrap-local.sh b/scripts/bootstrap/bootstrap-local.sh index 42cf103b2..2da566d7f 100755 --- a/scripts/bootstrap/bootstrap-local.sh +++ b/scripts/bootstrap/bootstrap-local.sh @@ -154,7 +154,8 @@ phase_2_vault() { info "verifying we can decrypt" if ! ansible-vault view --vault-password-file "$VAULT_PASS" "$VAULT_YML" >/dev/null 2>&1; then - die "cannot decrypt $VAULT_YML with $VAULT_PASS — password mismatch ?" + TALAS_HINT="if you remember the password, edit $VAULT_PASS to match. Otherwise run scripts/bootstrap/reset-vault.sh to start over." + die "cannot decrypt $VAULT_YML with $VAULT_PASS — password mismatch" fi ok "vault decryption verified" diff --git a/scripts/bootstrap/enable-auto-deploy.sh b/scripts/bootstrap/enable-auto-deploy.sh index 58a67bc42..cad851425 100755 --- a/scripts/bootstrap/enable-auto-deploy.sh +++ b/scripts/bootstrap/enable-auto-deploy.sh @@ -1,8 +1,15 @@ #!/usr/bin/env bash -# enable-auto-deploy.sh — flip the workflow_dispatch-only gate on -# .forgejo/workflows/deploy.yml back to push:main + tag:v*. 
Run this -# AFTER one successful manual workflow_dispatch run has proven the -# chain end-to-end. +# enable-auto-deploy.sh — re-enable Forgejo Actions deploy workflow. +# +# Two scenarios : +# A. .forgejo/workflows.disabled/ exists (current state on this branch) +# → rename back to .forgejo/workflows/, then ensure deploy.yml's +# push: trigger is uncommented. +# B. .forgejo/workflows/deploy.yml exists with push: commented out +# → just uncomment. +# +# Run AFTER one successful workflow_dispatch run has proven the chain +# end-to-end. set -Eeuo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" @@ -10,43 +17,55 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" trap_errors REPO_ROOT=$(git -C "$SCRIPT_DIR" rev-parse --show-toplevel) || die "not in a git repo" -DEPLOY_YML="$REPO_ROOT/.forgejo/workflows/deploy.yml" + +WF_DIR="$REPO_ROOT/.forgejo/workflows" +WF_DISABLED="$REPO_ROOT/.forgejo/workflows.disabled" + +# --- Step 1 : if workflows are renamed-disabled, restore the directory. ------- +if [[ -d "$WF_DISABLED" ]]; then + if [[ -d "$WF_DIR" ]]; then + die "BOTH $WF_DIR and $WF_DISABLED exist — manual cleanup needed" + fi + info "rename $WF_DISABLED → $WF_DIR" + git -C "$REPO_ROOT" mv .forgejo/workflows.disabled .forgejo/workflows + ok "directory restored" +fi + +DEPLOY_YML="$WF_DIR/deploy.yml" require_file "$DEPLOY_YML" +# --- Step 2 : if push: trigger is commented, uncomment it. -------------------- if grep -qE '^[[:space:]]+push:$' "$DEPLOY_YML"; then - ok "auto-deploy already enabled" - exit 0 + ok "auto-deploy trigger already active in deploy.yml" +else + if ! 
grep -qE '^[[:space:]]+# push:' "$DEPLOY_YML"; then + die "deploy.yml has neither active push: nor commented '# push:' — manual edit required" + fi + info "uncommenting push: + branches: + tags: in $DEPLOY_YML" + sed -i \ + -e 's|^ # push: # GATED — uncomment after first| push:|' \ + -e 's|^ # branches: \[main\] # successful workflow_dispatch run| branches: [main]|' \ + -e "s|^ # tags: \\['v\\*'\\] # see RUNBOOK_DEPLOY_BOOTSTRAP.md| tags: ['v*']|" \ + "$DEPLOY_YML" + + if ! grep -qE '^[[:space:]]+push:$' "$DEPLOY_YML"; then + die "sed didn't apply — open $DEPLOY_YML and uncomment by hand" + fi + ok "trigger uncommented" fi -if ! grep -qE '^[[:space:]]+# push:' "$DEPLOY_YML"; then - die "deploy.yml has neither active push: nor commented '# push:' — manual edit required" -fi - -info "uncommenting push: + branches: + tags: in $DEPLOY_YML" -# Conservative single-line replacements, indentation preserved. -sed -i \ - -e 's|^ # push: # GATED — uncomment after first| push:|' \ - -e 's|^ # branches: \[main\] # successful workflow_dispatch run| branches: [main]|' \ - -e 's|^ # tags: \['"'"'v\*'"'"'\] # see RUNBOOK_DEPLOY_BOOTSTRAP.md| tags: ['"'"'v*'"'"']|' \ - "$DEPLOY_YML" - -# Verify. -if ! grep -qE '^[[:space:]]+push:$' "$DEPLOY_YML"; then - die "sed didn't apply — open $DEPLOY_YML and uncomment by hand" -fi - -ok "edited $DEPLOY_YML" +# --- Step 3 : prompt to commit + push. ---------------------------------------- info "diff:" -git -C "$REPO_ROOT" --no-pager diff -- "$DEPLOY_YML" >&2 +git -C "$REPO_ROOT" --no-pager diff -- "$WF_DIR" >&2 || true cat >&2 <&2 + Next step : + cd $SCRIPT_DIR + PHASE=2 ./bootstrap-local.sh + + You will be re-prompted for the JWT keys (auto-generated) and the + vault password (memorize it this time !). 
+EOF
diff --git a/scripts/bootstrap/verify-local.sh b/scripts/bootstrap/verify-local.sh
index 5b633f076..e9583c7d1 100755
--- a/scripts/bootstrap/verify-local.sh
+++ b/scripts/bootstrap/verify-local.sh
@@ -61,10 +61,25 @@ check "dig available" "command -v dig"
 section "Repo state"
 check "in repo root" "[[ -f $REPO_ROOT/CLAUDE.md ]]"
 check "infra/ansible/ exists" "[[ -d $REPO_ROOT/infra/ansible ]]"
-check ".forgejo/workflows/deploy.yml" "[[ -f $REPO_ROOT/.forgejo/workflows/deploy.yml ]]"
-check_with_hint "deploy.yml gated (no auto-trigger)" \
-    "! grep -E '^[[:space:]]+push:$' $REPO_ROOT/.forgejo/workflows/deploy.yml" \
-    "if you want auto-deploy, run scripts/bootstrap/enable-auto-deploy.sh"
+
+# .forgejo/workflows/ may be active OR renamed to .disabled/ — both are
+# valid states. Active = auto-trigger may fire ; disabled = manual run
+# only via re-enable script.
+if [[ -d "$REPO_ROOT/.forgejo/workflows.disabled" ]]; then
+    check "deploy.yml present (under workflows.disabled/)" \
+        "[[ -f $REPO_ROOT/.forgejo/workflows.disabled/deploy.yml ]]"
+    info " → workflows are DISABLED (renamed to workflows.disabled/) ;"
+    info " re-enable with scripts/bootstrap/enable-auto-deploy.sh"
+elif [[ -d "$REPO_ROOT/.forgejo/workflows" ]]; then
+    check "deploy.yml present" \
+        "[[ -f $REPO_ROOT/.forgejo/workflows/deploy.yml ]]"
+    check_with_hint "deploy.yml gated (no auto-trigger)" \
+        "! grep -E '^[[:space:]]+push:$' $REPO_ROOT/.forgejo/workflows/deploy.yml" \
+        "if you want auto-deploy, run scripts/bootstrap/enable-auto-deploy.sh"
+else
+    err "neither .forgejo/workflows/ nor .forgejo/workflows.disabled/ found"
+    FAIL=$(( ${FAIL:-0} + 1 ))  # arithmetic ; `FAIL+=1` string-appends unless FAIL is declare -i
+fi

 section "Vault"
 check "vault.yml.example exists" "[[ -f $REPO_ROOT/infra/ansible/group_vars/all/vault.yml.example ]]"
@@ -101,21 +116,25 @@ done

 if [[ -n "${FORGEJO_ADMIN_TOKEN:-}" ]]; then
     section "Forgejo API + secrets/vars"
+    # Shared curl flags : every check is time-bounded ; -k only when FORGEJO_INSECURE=1.
+    _CURL_OPTS=(--max-time 10)
+    [[ "${FORGEJO_INSECURE:-0}" == "1" ]] && _CURL_OPTS+=(-k)
+
     check_with_hint "Forgejo API reachable" \
-        "curl -fsSL --max-time 10 -H 'Authorization: token $FORGEJO_ADMIN_TOKEN' $FORGEJO_API_URL/api/v1/user" \
-        "set FORGEJO_API_URL ; if no DNS yet, FORGEJO_API_URL=http://10.0.20.105:3000"
+        "curl -fsSL ${_CURL_OPTS[*]} -H 'Authorization: token $FORGEJO_ADMIN_TOKEN' $FORGEJO_API_URL/api/v1/user" \
+        "set FORGEJO_API_URL ; for self-signed certs, set FORGEJO_INSECURE=1 in .env"
     check_with_hint "repo $FORGEJO_OWNER/$FORGEJO_REPO exists" \
-        "curl -fsSL -H 'Authorization: token $FORGEJO_ADMIN_TOKEN' $FORGEJO_API_URL/api/v1/repos/$FORGEJO_OWNER/$FORGEJO_REPO" \
+        "curl -fsSL ${_CURL_OPTS[*]} -H 'Authorization: token $FORGEJO_ADMIN_TOKEN' $FORGEJO_API_URL/api/v1/repos/$FORGEJO_OWNER/$FORGEJO_REPO" \
         "set FORGEJO_OWNER + FORGEJO_REPO env vars"
     check_with_hint "secret FORGEJO_REGISTRY_TOKEN exists" \
-        "curl -fsSL -H 'Authorization: token $FORGEJO_ADMIN_TOKEN' $FORGEJO_API_URL/api/v1/repos/$FORGEJO_OWNER/$FORGEJO_REPO/actions/secrets/FORGEJO_REGISTRY_TOKEN" \
+        "curl -fsSL ${_CURL_OPTS[*]} -H 'Authorization: token $FORGEJO_ADMIN_TOKEN' $FORGEJO_API_URL/api/v1/repos/$FORGEJO_OWNER/$FORGEJO_REPO/actions/secrets/FORGEJO_REGISTRY_TOKEN" \
         "PHASE=3 ./bootstrap-local.sh"
     check_with_hint "secret ANSIBLE_VAULT_PASSWORD exists" \
-        "curl -fsSL -H 'Authorization: token $FORGEJO_ADMIN_TOKEN' $FORGEJO_API_URL/api/v1/repos/$FORGEJO_OWNER/$FORGEJO_REPO/actions/secrets/ANSIBLE_VAULT_PASSWORD" \
+        "curl -fsSL ${_CURL_OPTS[*]} -H 'Authorization: token $FORGEJO_ADMIN_TOKEN' $FORGEJO_API_URL/api/v1/repos/$FORGEJO_OWNER/$FORGEJO_REPO/actions/secrets/ANSIBLE_VAULT_PASSWORD" \
         "PHASE=3 ./bootstrap-local.sh"
     check_with_hint "variable FORGEJO_REGISTRY_URL exists" \
-        "curl -fsSL -H 'Authorization: token $FORGEJO_ADMIN_TOKEN' $FORGEJO_API_URL/api/v1/repos/$FORGEJO_OWNER/$FORGEJO_REPO/actions/variables/FORGEJO_REGISTRY_URL" \
+        "curl -fsSL ${_CURL_OPTS[*]} -H 
'Authorization: token $FORGEJO_ADMIN_TOKEN' $FORGEJO_API_URL/api/v1/repos/$FORGEJO_OWNER/$FORGEJO_REPO/actions/variables/FORGEJO_REGISTRY_URL" \
         "PHASE=3 ./bootstrap-local.sh"
 else
     warn "FORGEJO_ADMIN_TOKEN not set — skipping API checks. Set it to run those."
diff --git a/scripts/bootstrap/verify-remote-ssh.sh b/scripts/bootstrap/verify-remote-ssh.sh
new file mode 100755
index 000000000..77c51eb2d
--- /dev/null
+++ b/scripts/bootstrap/verify-remote-ssh.sh
@@ -0,0 +1,51 @@
+#!/usr/bin/env bash
+# verify-remote-ssh.sh — wrapper that scp's lib.sh + verify-remote.sh
+# to the R720 then runs verify-remote.sh there. Saves the operator
+# from having to clone the repo on the R720.
+#
+# Reads R720_HOST + R720_USER from .env or environment.
+#
+# Exit status : that of the remote verify-remote.sh run (ssh reports
+# 255 when the connection itself fails), so callers can gate on it.
+
+set -Eeuo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+. "$SCRIPT_DIR/lib.sh"
+trap_errors
+
+[[ -f "$SCRIPT_DIR/.env" ]] && . "$SCRIPT_DIR/.env"
+
+: "${R720_HOST:=srv-102v}"
+# Empty R720_USER → bare host, so an ~/.ssh/config alias's User= applies.
+R720_USER_PFX=""
+[[ -n "${R720_USER:-}" ]] && R720_USER_PFX="$R720_USER@"
+SSH_TARGET="${R720_USER_PFX}${R720_HOST}"
+
+# NOTE(review): fixed names under /tmp are later run as root ; a
+# per-run `mktemp -d` on the remote would be safer — confirm that
+# verify-remote.sh locates lib.sh relative to itself first.
+info "uploading lib.sh + verify-remote.sh to $SSH_TARGET:/tmp/"
+scp -q "$SCRIPT_DIR/lib.sh" "$SCRIPT_DIR/verify-remote.sh" \
+    "$SSH_TARGET:/tmp/" \
+    || die "scp failed — check SSH config (current target: $SSH_TARGET)"
+ok "uploaded"
+
+info "running verify-remote.sh as root"
+# `sudo bash` so the state file at /var/lib/talas/bootstrap.state is
+# accessible. If your account has incus group access without sudo,
+# drop the `sudo`.
+# Record the status instead of discarding it : cleanup must still run,
+# but a failed verification has to fail this wrapper too.
+verify_rc=0
+ssh -t "$SSH_TARGET" "sudo bash /tmp/verify-remote.sh" || verify_rc=$?
+if (( verify_rc != 0 )); then
+    warn "verify-remote.sh exited non-zero — see output above"
+fi
+
+info "cleaning up tmp files on $SSH_TARGET"
+# No sudo : the files were uploaded by (and belong to) the SSH user, and
+# a sudoers rule limited to bash would make `sudo rm` prompt and fail.
+ssh "$SSH_TARGET" "rm -f /tmp/lib.sh /tmp/verify-remote.sh" || true
+
+(( verify_rc == 0 )) || exit "$verify_rc"
+ok "done"