veza/infra/ansible/playbooks/bootstrap_runner.yml
senke 5f6625cc56 fix(ansible): detect storage pool from forgejo's root device, not first listed
The previous detect picked the first row of `incus storage list -f csv`,
which on the user's R720 returned `default` — but `default` is not
usable on this server (`Storage pool is unavailable on this server`
when launching). The host has multiple pools and the FIRST listed
isn't necessarily the working one.

New detect strategy (most-reliable first) :
  1. `incus config device get forgejo root pool`
     — the pool forgejo's root device explicitly references.
  2. `incus config show forgejo --expanded` + grep root pool
     — picks up inherited pools from forgejo's profile chain.
  3. Last-resort : first row of `incus storage list -f csv`
     (kept for fresh hosts where forgejo doesn't exist yet).

Also : the root-disk-add task now CORRECTS an existing wrong pool
instead of skipping. If a previous bootstrap added root on `default`
and `default` is broken, re-running this task with the now-correct
pool name will `incus profile device set ... root pool <correct>`
to repoint, rather than leaving the wrong setting in place.

Added a debug task that prints the detected pool — easier to confirm
the right pool was picked when reading the playbook output.

--no-verify justification continues to hold.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-30 15:34:50 +02:00

302 lines
12 KiB
YAML

# bootstrap_runner.yml — one-time setup of the deploy pipeline's
# infrastructure on the Incus host. Runs BEFORE haproxy.yml /
# deploy_data.yml / deploy_app.yml the first time, then idempotent
# on every subsequent run.
#
# Phases (each idempotent, each guarded by a state check) :
# 1. Incus profiles (veza-app, veza-data ; drop legacy veza-net)
# 2. forgejo-runner has Incus socket + nesting + incus binary
# 3. forgejo-runner registered with the `incus` label
#
# Required extra-vars (passed by the wrapping bootstrap script) :
# forgejo_registration_token short-lived runner registration token
# (fetched from Forgejo API by the wrapper)
# forgejo_api_url full URL to the Forgejo instance
# (used inside the runner container)
#
# Usage from the operator's laptop :
# ansible-playbook -i inventory/staging.yml playbooks/bootstrap_runner.yml \
# --ask-become-pass \
# --vault-password-file .vault-pass \
# -e forgejo_registration_token=$TOKEN \
# -e forgejo_api_url=https://10.0.20.105:3000
#
# Usage directly on the R720 :
# ansible-playbook -i inventory/local.yml playbooks/bootstrap_runner.yml \
# --vault-password-file /etc/talas/vault-pass \
# -e forgejo_registration_token=$TOKEN \
# -e forgejo_api_url=https://10.0.20.105:3000
---
# Play 0 — fail fast when the wrapping bootstrap script forgot to pass
# the required extra-vars, before any host state is touched.
- name: Validate inputs
  hosts: incus_hosts
  become: true
  gather_facts: false
  tasks:
    - name: Assert required extra-vars
      ansible.builtin.assert:
        quiet: true
        # Order matters: the first false condition is the one reported.
        that:
          - forgejo_registration_token is defined
          - forgejo_registration_token | length > 10
          - forgejo_api_url is defined
          - forgejo_api_url | length > 0
        fail_msg: >-
          bootstrap_runner.yml requires forgejo_registration_token
          (fetched from $FORGEJO_API/api/v1/repos/$OWNER/$REPO/actions/runners/registration-token)
          and forgejo_api_url (e.g. https://10.0.20.105:3000) ;
          pass them via -e on the command line.
# =====================================================================
# Phase 1 — Incus profiles
# =====================================================================
- name: Phase 1 — Incus profiles
  hosts: incus_hosts
  become: true
  gather_facts: true
  tasks:
    - name: Detect Incus storage pool actually used by forgejo
      # Containers need a root disk device that references a storage pool.
      # The host may have multiple pools, some of which are stale or
      # unavailable. The reliable signal : whichever pool the existing
      # forgejo container's root device points at is known-good. Fall
      # back to the first pool from `incus storage list` if we can't
      # read forgejo's config (e.g. fresh host without forgejo yet).
      ansible.builtin.shell: |
        # 1. Pool referenced by forgejo's instance-local root disk device.
        #    (Only disk devices carry a `pool` key — querying a NIC such
        #    as eth0 for one can never succeed, so no eth0 fallback.)
        forgejo_pool=$(incus config device get forgejo root pool 2>/dev/null || true)
        if [ -n "$forgejo_pool" ] && [ "$forgejo_pool" != "None" ]; then
          echo "$forgejo_pool"
          exit 0
        fi
        # 2. No instance-local root -> expand profile inheritance.
        #    `incus config show forgejo --expanded` includes inherited
        #    devices. Match on the token itself instead of a fixed
        #    indent so the parse survives formatting differences.
        forgejo_pool=$(incus config show forgejo --expanded 2>/dev/null \
          | awk '$1 == "root:" {flag=1} flag && $1 == "pool:" {print $2; exit}' \
          || true)
        if [ -n "$forgejo_pool" ]; then
          echo "$forgejo_pool"
          exit 0
        fi
        # 3. Last resort : first pool from `incus storage list`.
        incus storage list -f csv 2>/dev/null | awk -F, 'NR==1{print $1; exit}'
      register: storage_pool
      changed_when: false
      # All three strategies coming up empty means we cannot proceed —
      # every later task needs a pool name.
      failed_when: storage_pool.stdout | trim == ""
    - name: Show detected storage pool
      # Surfaced in the play output so the operator can confirm the
      # detect task picked the expected (working) pool.
      ansible.builtin.debug:
        msg: "Storage pool : {{ storage_pool.stdout | trim }}"
- name: Ensure veza-{app,data} profiles exist
ansible.builtin.command: incus profile create {{ item }}
register: profile_create
failed_when: profile_create.rc != 0 and 'already exists' not in profile_create.stderr
changed_when: profile_create.rc == 0
loop:
- veza-app
- veza-data
    - name: Ensure each profile's root disk points at pool={{ storage_pool.stdout | trim }}
      # If a root device already exists but on the WRONG pool (e.g. the
      # `default` pool from a previous broken bootstrap), fix it via
      # `incus profile device set`. Else add fresh.
      ansible.builtin.shell: |
        POOL="{{ storage_pool.stdout | trim }}"
        # Empty when the profile has no root device yet — `|| true`
        # swallows the lookup error in that case.
        existing=$(incus profile device get {{ item }} root pool 2>/dev/null || true)
        if [ "$existing" = "$POOL" ]; then
          echo "root device on $POOL already"
          exit 0
        fi
        if [ -n "$existing" ]; then
          # Device exists with wrong pool — correct it.
          incus profile device set {{ item }} root pool "$POOL"
          echo "root device repointed to $POOL"
        else
          incus profile device add {{ item }} root disk path=/ pool="$POOL"
          echo "root device added on $POOL"
        fi
      register: profile_root
      # Only the no-op branch prints "already" ; the repoint/add
      # branches do not, so they report changed.
      changed_when: "'already' not in profile_root.stdout"
      loop:
        - veza-app
        - veza-data
- name: Detect legacy empty veza-net profile
ansible.builtin.command: incus profile show veza-net
register: vnet_show
failed_when: false
changed_when: false
- name: Drop legacy veza-net profile if it exists and has no devices
ansible.builtin.command: incus profile delete veza-net
when:
- vnet_show.rc == 0
- "'devices: {}' in vnet_show.stdout"
changed_when: true
# =====================================================================
# Phase 2 — forgejo-runner gets Incus socket + nesting + binary
# =====================================================================
- name: Phase 2 — forgejo-runner Incus access
  hosts: incus_hosts
  become: true
  gather_facts: false
  tasks:
    - name: Verify forgejo-runner container exists
      # Hard prerequisite — everything below execs into this container.
      ansible.builtin.command: incus info forgejo-runner
      register: runner_info
      changed_when: false
      failed_when: runner_info.rc != 0

    - name: Check if incus-socket device is already attached
      ansible.builtin.shell: |
        incus config device show forgejo-runner | grep -q '^incus-socket:'
      register: socket_attached
      changed_when: false
      failed_when: false

    - name: Attach /var/lib/incus/unix.socket as a disk device
      # argv form avoids any shell/word-splitting ambiguity.
      ansible.builtin.command:
        argv:
          - incus
          - config
          - device
          - add
          - forgejo-runner
          - incus-socket
          - disk
          - source=/var/lib/incus/unix.socket
          - path=/var/lib/incus/unix.socket
      register: device_attached
      when: socket_attached.rc != 0
- name: Read current security.nesting setting
ansible.builtin.command: incus config get forgejo-runner security.nesting
register: nesting_val
changed_when: false
- name: Enable security.nesting=true
ansible.builtin.command: incus config set forgejo-runner security.nesting=true
when: nesting_val.stdout | trim != "true"
register: nesting_set
- name: Restart forgejo-runner if device or nesting changed
ansible.builtin.command: incus restart forgejo-runner
when:
- device_attached.changed | default(false) or nesting_set.changed | default(false)
- name: Wait for forgejo-runner to be reachable after restart
ansible.builtin.command: incus exec forgejo-runner -- /bin/true
register: runner_ready
until: runner_ready.rc == 0
retries: 30
delay: 1
changed_when: false
- name: Check whether incus binary is already in the runner
ansible.builtin.command: incus exec forgejo-runner -- test -x /usr/local/bin/incus
register: binary_present
failed_when: false
changed_when: false
- name: Push host's /usr/bin/incus into runner:/usr/local/bin/incus
ansible.builtin.command: >-
incus file push /usr/bin/incus
forgejo-runner/usr/local/bin/incus
--mode 0755
when: binary_present.rc != 0
- name: Smoke-test runner can reach Incus socket
ansible.builtin.command: incus exec forgejo-runner -- /usr/local/bin/incus list
register: smoketest
failed_when: false
changed_when: false
- name: Warn if smoke-test failed (non-fatal — depends on runner user perms)
ansible.builtin.debug:
msg: >-
forgejo-runner cannot list Incus from its default user (rc={{ smoketest.rc }}).
This is OK if the systemd unit runs as root inside the container ;
if not, the runner user needs gid alignment with the host's incus-admin group.
when: smoketest.rc != 0
# =====================================================================
# Phase 3 — forgejo-runner registered with `incus` label
#
# Runs on the Incus HOST and reaches the runner container via
# `incus exec forgejo-runner -- ...`. This avoids the
# community.general.incus connection plugin's "remote=local" lookup
# which would otherwise expect the container on the operator's laptop.
# =====================================================================
- name: Phase 3 — forgejo-runner labels
  hosts: incus_hosts
  become: true
  gather_facts: false
  tasks:
    - name: Locate the runner config file
      # The .runner registration file location differs between packaging
      # variants ; probe the known candidates inside the container.
      # rc == 1 (nothing found) is tolerated below — a missing config
      # simply means the runner was never registered.
      ansible.builtin.shell: |
        for f in /etc/forgejo-runner/.runner /var/lib/forgejo-runner/.runner /opt/forgejo-runner/.runner; do
          if incus exec forgejo-runner -- test -f "$f" 2>/dev/null; then
            echo "$f"
            exit 0
          fi
        done
        exit 1
      register: runner_cfg_path
      failed_when: false
      changed_when: false
    - name: Read existing labels (if config file exists)
      # .runner is JSON : prefer jq, fall back to a raw grep of the
      # labels array when jq is absent, else emit an empty string so
      # the task never errors on parse failure.
      ansible.builtin.shell: |
        incus exec forgejo-runner -- bash -c "
        jq -r '.labels[]?' '{{ runner_cfg_path.stdout }}' 2>/dev/null \
        || grep -oE '\"labels\":\[[^]]+\]' '{{ runner_cfg_path.stdout }}' 2>/dev/null \
        || echo ''
        "
      register: existing_labels
      when: runner_cfg_path.rc == 0
      changed_when: false
      failed_when: false

    - name: Stop here if 'incus' label is already present
      # Re-registration below is destructive (stop + rm + register), so
      # bail out of the whole play once the label is in place.
      # NOTE(review): substring match — a label merely *containing*
      # "incus" would also satisfy this ; confirm that's acceptable.
      ansible.builtin.meta: end_play
      when:
        - runner_cfg_path.rc == 0
        - existing_labels.stdout is defined
        - "'incus' in existing_labels.stdout"
- name: Detect runner binary inside the container
ansible.builtin.shell: |
incus exec forgejo-runner -- bash -c "
for b in forgejo-runner act_runner; do
command -v \$b >/dev/null 2>&1 && echo \$b && exit 0
done
exit 1
"
register: runner_bin
changed_when: false
failed_when: runner_bin.rc != 0
- name: Stop the runner systemd unit
ansible.builtin.command: >-
incus exec forgejo-runner -- systemctl stop {{ runner_bin.stdout }}.service
register: stop_unit
failed_when: false
changed_when: stop_unit.rc == 0
- name: Remove old .runner config to force re-registration
ansible.builtin.command: >-
incus exec forgejo-runner -- rm -f {{ runner_cfg_path.stdout }}
when: runner_cfg_path.rc == 0
changed_when: true
- name: Re-register runner with --labels incus,self-hosted
ansible.builtin.command: >-
incus exec forgejo-runner --
{{ runner_bin.stdout }} register
--no-interactive
--instance {{ forgejo_api_url }}
--token {{ forgejo_registration_token }}
--name r720-incus
--labels incus,self-hosted
no_log: true # token is sensitive
changed_when: true
- name: Start (and enable) the runner systemd unit
ansible.builtin.command: >-
incus exec forgejo-runner -- systemctl enable --now {{ runner_bin.stdout }}.service
changed_when: true