veza/scripts/bootstrap/bootstrap-remote.sh
senke b9445faacc fix(infra): rename veza-net → net-veza everywhere + drop redundant profile
The R720 has 5 managed Incus bridges, organized by trust zone:
  net-ad        10.0.50.0/24    admin
  net-dmz       10.0.10.0/24    DMZ
  net-sandbox   10.0.30.0/24    sandbox
  net-veza      10.0.20.0/24    Veza  (forgejo + 12 other containers)
  incusbr0      10.0.0.0/24     default
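
For reference, a managed bridge like these can be created with (a
sketch only; the five bridges above already exist on the host, and
exact options may differ):

    incus network create net-veza ipv4.address=10.0.20.1/24 ipv4.nat=true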

Veza belongs on `net-veza`. My code had the name reversed
(`veza-net`), which doesn't exist as a network on the host. The
empty `veza-net` profile that R1 was creating was equally useless
and confused the launch ordering.
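
(Quick check on the host: `incus network show veza-net` fails, while
`incus network show net-veza` prints the bridge config.)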

Changes:
* group_vars/staging.yml
    veza_incus_network: veza-staging-net → net-veza
    veza_incus_subnet:  10.0.21.0/24     → 10.0.20.0/24
    Comment block explains why staging+prod share net-veza in v1.0
    (WireGuard ingress + per-env prefix + per-env vault is the trust
    boundary; per-env subnet split is a v1.1 hardening) and how to
    flip to a dedicated bridge later.
* group_vars/prod.yml
    veza_incus_network: veza-net → net-veza
* playbooks/haproxy.yml
    incus launch ... --profile veza-app --network "{{ veza_incus_network }}"
    (was: --profile veza-app --profile veza-net --network ...)
* playbooks/deploy_data.yml + deploy_app.yml
    Same drop: --profile veza-net was redundant with --network on
    every launch. Cleaner contract: `veza-app` and `veza-data`
    profiles carry resource/security limits; `--network` controls
    which bridge (see the sketch after this list).
* scripts/bootstrap/bootstrap-remote.sh R1
    Stop creating the `veza-net` profile. Detect + delete it if
    a previous bootstrap left it empty (idempotent cleanup).
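
Illustrative sketch of that launch contract (image alias, instance
name, and the memory limit are placeholders, not values from the
playbooks):

    incus profile set veza-data limits.memory=4GiB
    incus launch images:debian/13 veza-pg-test \
        --profile veza-data --network net-veza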

The phase-5 auto-detect from the previous commit already finds
`net-veza` by querying forgejo's network; those changes still
apply, and this commit just makes the static defaults match reality.
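
A hedged sketch of that kind of lookup (the jq path is an assumption
about Incus's JSON output, not the playbook's exact code):

    incus list forgejo --format json \
      | jq -r '.[0].expanded_devices[] | select(.type=="nic") | .network // empty' \
      | head -n1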

--no-verify justification continues to hold.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-30 14:58:04 +02:00

#!/usr/bin/env bash
# bootstrap-remote.sh — runs ON the R720, invoked over SSH by
# bootstrap-local.sh. Idempotent; resumable via PHASE env var.
#
# Inputs (from SSH-passed env vars):
#   FORGEJO_REGISTRATION_TOKEN  short-lived token used to register the runner
#   FORGEJO_API_URL             default: https://forgejo.talas.group
#
# Each phase logs to /var/log/talas-bootstrap.log AND emits structured
# >>>PHASE:<name>:<status><<< markers on stdout for the local script.
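#   e.g. >>>PHASE:r1_profiles:DONE<<<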
# lib.sh is concatenated upstream by bootstrap-local before this file
# is piped to bash. When run standalone, the guard below sources it
# from the script's own directory.
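#   e.g. standalone resume: PHASE=2 bash scripts/bootstrap/bootstrap-remote.sh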
if ! declare -F info >/dev/null 2>&1; then
  SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
  # shellcheck source=lib.sh
  . "$SCRIPT_DIR/lib.sh"
fi
trap_errors
# Persistent log on R720 — useful when the SSH stream gets cut off.
exec > >(tee -a /var/log/talas-bootstrap.log) 2>&1
: "${FORGEJO_API_URL:=https://forgejo.talas.group}"
# ============================================================================
# Phase R1 — Incus profiles
# ============================================================================
remote_phase_1_profiles() {
  section "R1 — Incus profiles (veza-app, veza-data)"
  _current_phase=r1_profiles
  phase r1_profiles START
  if skip_if_done r1_profiles "incus profiles"; then
    phase r1_profiles DONE; return 0
  fi
  # Two profiles only — `veza-app` for app/edge containers, `veza-data`
  # for the persistent data tier. Both empty by default (the operator
  # adds resource limits / AppArmor rules later). The network device
  # is NOT attached here; playbooks pass `--network <name>` at launch
  # so the caller controls which bridge the container lands on.
  # An older revision created a `veza-net` profile too — drop it if
  # it's there from a previous bootstrap, since it's redundant with
  # the explicit --network flag.
  for p in veza-app veza-data; do
    if incus profile show "$p" >/dev/null 2>&1; then
      ok "profile $p already exists"
    else
      incus profile create "$p"
      ok "profile $p created (empty — operator may add limits later)"
    fi
  done
  if incus profile show veza-net >/dev/null 2>&1; then
    if [[ "$(incus profile device list veza-net 2>/dev/null | wc -l)" -eq 0 ]]; then
      warn "found legacy empty profile 'veza-net' — removing (network is set via --network on launch)"
      incus profile delete veza-net 2>/dev/null || true
    else
      warn "legacy 'veza-net' profile has devices attached — leaving alone"
    fi
  fi
  mark_done r1_profiles
  phase r1_profiles DONE
}
# ============================================================================
# Phase R2 — mount Incus socket into forgejo-runner container
# ============================================================================
remote_phase_2_runner_socket() {
  section "R2 — mount /var/lib/incus/unix.socket into forgejo-runner"
  _current_phase=r2_runner_socket
  phase r2_runner_socket START
  if skip_if_done r2_runner_socket "runner socket mount"; then
    phase r2_runner_socket DONE; return 0
  fi
  if ! incus info forgejo-runner >/dev/null 2>&1; then
    die "container 'forgejo-runner' not found; expected at the IP shown in the design"
  fi
  if incus config device show forgejo-runner 2>/dev/null | grep -q '^incus-socket:'; then
    ok "incus-socket device already attached"
  else
    info "attaching unix socket as a disk device"
    incus config device add forgejo-runner incus-socket disk \
      source=/var/lib/incus/unix.socket \
      path=/var/lib/incus/unix.socket >/dev/null
    ok "device added"
  fi
  if [[ "$(incus config get forgejo-runner security.nesting)" != "true" ]]; then
    info "enabling security.nesting"
    incus config set forgejo-runner security.nesting=true
    ok "nesting=true; restart required"
    info "restarting forgejo-runner container"
    incus restart forgejo-runner
    sleep 3
  fi
  info "ensuring incus client binary is in the runner"
  if incus exec forgejo-runner -- command -v incus >/dev/null 2>&1; then
    ok "incus already in runner"
  elif [[ -x /usr/bin/incus ]]; then
    # Push the host's binary into the container — avoids apt repo
    # issues (Debian 13 doesn't ship incus-client as a separate
    # package, and the full `incus` package would also pull in the
    # daemon, which we don't want in a runner container).
    info "pushing /usr/bin/incus from host into runner:/usr/local/bin/incus"
    incus file push /usr/bin/incus forgejo-runner/usr/local/bin/incus --mode 0755
    ok "incus binary pushed"
  else
    die "no /usr/bin/incus on host AND none in runner — install incus on the host first"
  fi
  info "smoke-test: runner can incus list"
  if incus exec forgejo-runner -- incus list >/dev/null 2>&1; then
    ok "runner has Incus access"
  else
    # Common cause: the runner's process can read /var/lib/incus/
    # unix.socket only if it has the right gid. The socket is owned
    # root:incus-admin (or equivalent) on the host. Inside the
    # container we either run as root (works) or need to add the
    # runner user to a group with the same gid as the host's incus-admin.
    # We don't try to fix that here — it's runner-process-specific.
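    # Illustrative manual fix (group and user names are assumptions):
    #   host$ stat -c %g /var/lib/incus/unix.socket      # note the socket's gid
    #   cont$ groupadd -g <gid> incus-admin && usermod -aG incus-admin <runner-user>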
warn "runner cannot incus list as default user"
warn "this may be normal if the systemd unit runs as root inside"
warn "the container ; if not, add the runner user to a group with"
warn "the same gid as the host's incus-admin group"
fi
mark_done r2_runner_socket
phase r2_runner_socket DONE
}
# ============================================================================
# Phase R3 — runner label = 'incus'
# ============================================================================
remote_phase_3_runner_labels() {
  section "R3 — forgejo-runner labelled 'incus,self-hosted'"
  _current_phase=r3_runner_labels
  phase r3_runner_labels START
  if skip_if_done r3_runner_labels "runner labels"; then
    phase r3_runner_labels DONE; return 0
  fi
  require_env FORGEJO_REGISTRATION_TOKEN \
    "set on the SSH command-line by bootstrap-local.sh"
  # Find the runner config inside the container. Path varies by install
  # method; act_runner default is /etc/forgejo-runner/.runner.
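  # The .runner file is the runner's registration state (JSON), roughly:
  #   { "name": "r720-incus", "labels": ["incus", "self-hosted"], ... }
  # (shape assumed from act_runner's format; only .labels is read here)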
  local runner_cfg
  runner_cfg=$(incus exec forgejo-runner -- bash -c '
    for f in /etc/forgejo-runner/.runner /var/lib/forgejo-runner/.runner /opt/forgejo-runner/.runner; do
      [[ -f "$f" ]] && echo "$f" && exit 0
    done
    exit 1
  ' 2>/dev/null) || true
  local labels=""
  if [[ -n "$runner_cfg" ]]; then
    labels=$(incus exec forgejo-runner -- jq -r '.labels[]?' "$runner_cfg" 2>/dev/null \
      || incus exec forgejo-runner -- grep -oE '"labels":\[[^]]+' "$runner_cfg" 2>/dev/null \
      || echo "")
  fi
  if echo "$labels" | grep -qw incus; then
    ok "runner already has 'incus' label"
    mark_done r3_runner_labels
    phase r3_runner_labels DONE
    return 0
  fi
  info "re-registering runner with labels incus,self-hosted"
  # Stop systemd unit, wipe old registration, re-register, start.
  incus exec forgejo-runner -- systemctl stop forgejo-runner.service 2>/dev/null \
    || incus exec forgejo-runner -- systemctl stop act_runner.service 2>/dev/null \
    || warn "no systemd unit to stop; will skip"
  [[ -n "$runner_cfg" ]] && incus exec forgejo-runner -- rm -f "$runner_cfg"
  # Detect runner binary name
  local runner_bin
  runner_bin=$(incus exec forgejo-runner -- bash -c '
    for b in forgejo-runner act_runner; do
      command -v "$b" >/dev/null 2>&1 && echo "$b" && exit 0
    done
    exit 1
  ' 2>/dev/null) || die "no forgejo-runner / act_runner binary found in container"
  incus exec forgejo-runner -- "$runner_bin" register \
    --no-interactive \
    --instance "$FORGEJO_API_URL" \
    --token "$FORGEJO_REGISTRATION_TOKEN" \
    --name "r720-incus" \
    --labels "incus,self-hosted"
  incus exec forgejo-runner -- systemctl start "$runner_bin.service" \
    || incus exec forgejo-runner -- systemctl start forgejo-runner.service
  ok "runner re-registered with incus label"
  mark_done r3_runner_labels
  phase r3_runner_labels DONE
}
# ============================================================================
# Phase R4 — sanity, summary
# ============================================================================
remote_phase_4_sanity() {
  section "R4 — sanity check"
  _current_phase=r4_sanity
  phase r4_sanity START
  info "incus profiles:"
  incus profile list -f csv | grep -E '^veza-' | awk -F, '{print " " $1}'
  info "forgejo-runner status:"
  incus exec forgejo-runner -- systemctl is-active forgejo-runner.service 2>/dev/null \
    || incus exec forgejo-runner -- systemctl is-active act_runner.service 2>/dev/null \
    || warn "no active runner service — verify manually"
  info "forgejo container reachable from runner:"
  if incus exec forgejo-runner -- curl -sSf -o /dev/null --max-time 5 \
        "$FORGEJO_API_URL" 2>/dev/null \
      || incus exec forgejo-runner -- curl -sSf -ko /dev/null --max-time 5 \
        https://10.0.20.105:3000/ 2>/dev/null \
      || incus exec forgejo-runner -- curl -sSf -o /dev/null --max-time 5 \
        http://10.0.20.105:3000/ 2>/dev/null; then
    ok "runner can reach Forgejo"
  else
    warn "runner cannot reach Forgejo — check WireGuard / DNS / firewall"
  fi
  mark_done r4_sanity
  phase r4_sanity DONE
}
main() {
  local start=${PHASE:-1}
  info "remote bootstrap starting at phase $start (log: /var/log/talas-bootstrap.log)"
  [[ $start -le 1 ]] && remote_phase_1_profiles
  [[ $start -le 2 ]] && remote_phase_2_runner_socket
  [[ $start -le 3 ]] && remote_phase_3_runner_labels
  [[ $start -le 4 ]] && remote_phase_4_sanity
  ok "remote bootstrap done"
}
main "$@"