From 65c20835c10db158e46b61e1b900053bf17fdb19 Mon Sep 17 00:00:00 2001
From: senke
Date: Mon, 27 Apr 2026 18:16:38 +0200
Subject: [PATCH] =?UTF-8?q?feat(infra):=20Ansible=20IaC=20scaffolding=20?=
 =?UTF-8?q?=E2=80=94=20common=20+=20incus=5Fhost=20roles=20(Day=205=20v1.0?=
 =?UTF-8?q?.9)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Day 5 of ROADMAP_V1.0_LAUNCH.md §Semaine 1: turn the manual host-setup
steps into an idempotent playbook so subsequent days (W2 Postgres HA,
W2 PgBouncer, W2 OTel collector, W3 Redis Sentinel, W3 MinIO
distributed, W4 HAProxy) can each land as a self-contained role on top
of this baseline.

Layout (full tree under infra/ansible/):

- ansible.cfg — pinned defaults: inventory path, ControlMaster=auto so
  the SSH handshake is paid once per playbook run.
- inventory/{lab,staging,prod}.yml — three environments. lab is the
  R720's local Incus container (10.0.20.150), staging is Hetzner (TODO
  until W2 provisions the box), prod is the R720 (TODO until DNS at
  EX-5 lands).
- group_vars/all.yml — shared defaults: SSH whitelist, fail2ban
  thresholds, unattended-upgrades origins, node_exporter version pin.
- playbooks/site.yml — entry point. Two plays:
  1. common (every host)
  2. incus_host (incus_hosts group)
- roles/common/ — idempotent baseline:
  - ssh.yml — drop-in /etc/ssh/sshd_config.d/50-veza-hardening.conf,
    validated with `sshd -t` before reload; asserts ssh_allow_users is
    non-empty before applying (refuses to lock out the operator).
  - fail2ban.yml — sshd jail tuned from group_vars (defaults
    bantime=1h, findtime=10min, maxretry=5).
  - unattended_upgrades.yml — security-only origins, Automatic-Reboot
    pinned to false (the operator owns reboot windows for SLO-budget
    alignment, cf. W2 day 10).
  - node_exporter.yml — pinned to 1.8.2, runs as a systemd unit on
    :9100. Skips the download when the installed --version already
    matches.
- roles/incus_host/ — zabbly upstream apt repo + incus + incus-client
  install. First-time `incus admin init --preseed` runs only when
  `incus list` errors (i.e. the host has never been initialised), so
  re-runs on initialised hosts are no-ops. Configures incusbr0 /
  10.99.0.1/24 with NAT + the default storage pool.

Acceptance verified locally (a full --check needs SSH to the lab host,
which is not reachable from this box, so the user runs that step):

  $ cd infra/ansible
  $ ansible-playbook -i inventory/lab.yml playbooks/site.yml --syntax-check
  playbook: playbooks/site.yml                      ← clean
  $ ansible-playbook -i inventory/lab.yml playbooks/site.yml --list-tasks
  21 tasks across 2 plays, all tagged.              ← partial applies work

Conventions enforced from the start:

- Every task has tags, so `--tags ssh,fail2ban` partial applies are
  always possible.
- Sub-task files (ssh.yml, fail2ban.yml, etc.) keep the role main.yml
  a directory of concerns, not a wall of tasks.
- Validators run before reload (sshd -t for sshd_config); the role
  refuses to apply changes that would lock the operator out.
- Comments answer "why" — task names + module names already say
  "what".

Next role on the stack: postgres_ha (W2 day 6) — pg_auto_failover
monitor + primary + replica in 2 Incus containers.

SKIP_TESTS=1 — IaC YAML, no app code.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
 infra/ansible/README.md                            | 111 ++++++++++++++++++
 infra/ansible/ansible.cfg                          |  20 ++++
 infra/ansible/group_vars/all.yml                   |  40 +++++++
 infra/ansible/inventory/lab.yml                    |  21 ++++
 infra/ansible/inventory/prod.yml                   |  21 ++++
 infra/ansible/inventory/staging.yml                |  20 ++++
 infra/ansible/playbooks/site.yml                   |  25 ++++
 infra/ansible/roles/common/defaults/main.yml       |  20 ++++
 infra/ansible/roles/common/handlers/main.yml       |  21 ++++
 infra/ansible/roles/common/tasks/fail2ban.yml      |  18 +++
 infra/ansible/roles/common/tasks/main.yml          |  58 +++++++++
 .../roles/common/tasks/node_exporter.yml           |  56 +++++++++
 infra/ansible/roles/common/tasks/ssh.yml           |  30 +++++
 .../common/tasks/unattended_upgrades.yml           |  30 +++++
 .../common/templates/50unattended-upgrades.j2      |  14 +++
 .../roles/common/templates/jail.local.j2           |  17 +++
 .../common/templates/node_exporter.service.j2      |  25 ++++
 .../common/templates/sshd_hardening.conf.j2        |  18 +++
 .../roles/incus_host/defaults/main.yml             |   6 +
 .../roles/incus_host/handlers/main.yml             |  10 ++
 infra/ansible/roles/incus_host/tasks/main.yml      | 101 ++++++++++++++++
 21 files changed, 682 insertions(+)
 create mode 100644 infra/ansible/README.md
 create mode 100644 infra/ansible/ansible.cfg
 create mode 100644 infra/ansible/group_vars/all.yml
 create mode 100644 infra/ansible/inventory/lab.yml
 create mode 100644 infra/ansible/inventory/prod.yml
 create mode 100644 infra/ansible/inventory/staging.yml
 create mode 100644 infra/ansible/playbooks/site.yml
 create mode 100644 infra/ansible/roles/common/defaults/main.yml
 create mode 100644 infra/ansible/roles/common/handlers/main.yml
 create mode 100644 infra/ansible/roles/common/tasks/fail2ban.yml
 create mode 100644 infra/ansible/roles/common/tasks/main.yml
 create mode 100644 infra/ansible/roles/common/tasks/node_exporter.yml
 create mode 100644 infra/ansible/roles/common/tasks/ssh.yml
 create mode 100644 infra/ansible/roles/common/tasks/unattended_upgrades.yml
 create mode 100644 infra/ansible/roles/common/templates/50unattended-upgrades.j2
 create mode 100644 infra/ansible/roles/common/templates/jail.local.j2
 create mode 100644 infra/ansible/roles/common/templates/node_exporter.service.j2
 create mode 100644 infra/ansible/roles/common/templates/sshd_hardening.conf.j2
 create mode 100644 infra/ansible/roles/incus_host/defaults/main.yml
 create mode 100644 infra/ansible/roles/incus_host/handlers/main.yml
 create mode 100644 infra/ansible/roles/incus_host/tasks/main.yml

diff --git a/infra/ansible/README.md b/infra/ansible/README.md
new file mode 100644
index 000000000..1211ba84d
--- /dev/null
+++ b/infra/ansible/README.md
@@ -0,0 +1,111 @@
+# Veza Ansible IaC
+
+Infrastructure-as-code for the Veza self-hosted platform. Roles, inventories and playbooks that turn a fresh Debian/Ubuntu host into a running Veza node.
+
+Scope at v1.0.9 Day 5 (this commit): scaffolding only — `common` baseline + `incus_host` install. Subsequent days add postgres_ha (W2), pgbouncer (W2), pgbackrest (W2), otel_collector (W2), redis_sentinel (W3), minio_distributed (W3), haproxy (W4) and backend_api (W4) — each as a standalone role under `roles/`.
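+
+For orientation, the layering each of those roles follows (a sketch: `postgres_ha` and its host group come from the roadmap and don't exist in the tree yet):
+
+```yaml
+# playbooks/site.yml (future) — a new play appended after the incus_host play
+- name: Postgres HA (pg_auto_failover monitor + nodes)
+  hosts: postgres_ha
+  become: true
+  roles:
+    - postgres_ha
+```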
+
+## Layout
+
+```
+infra/ansible/
+├── ansible.cfg          # pinned defaults (inventory path, ControlMaster)
+├── inventory/
+│   ├── lab.yml          # R720 lab Incus container — dry-run target
+│   ├── staging.yml      # Hetzner staging (TODO IP — W2 provision)
+│   └── prod.yml         # R720 prod (TODO IP — DNS at EX-5)
+├── group_vars/
+│   └── all.yml          # shared defaults (SSH, fail2ban, …)
+├── host_vars/           # per-host overrides (gitignored if secret-bearing)
+├── playbooks/
+│   └── site.yml         # entry-point — applies common + incus_host
+└── roles/
+    ├── common/          # SSH hardening · fail2ban · unattended-upgrades · node_exporter
+    └── incus_host/      # Incus install + first-time init
+```
+
+## Quickstart
+
+### Lab dry-run (syntax + dry-execute, no remote changes)
+
+```bash
+cd infra/ansible
+ansible-playbook -i inventory/lab.yml playbooks/site.yml --check
+```
+
+`--check` is the acceptance gate for v1.0.9 Day 5 — it must pass clean before merging any role change.
+
+### Lab apply
+
+```bash
+ansible-playbook -i inventory/lab.yml playbooks/site.yml
+```
+
+The lab host is the R720's local `srv-101v` Incus container (or whatever IP you set under `inventory/lab.yml::veza-lab.ansible_host`). It exists specifically to absorb role changes before they reach staging or prod.
+
+### Staging / prod
+
+Currently `TODO_HETZNER_IP` / `TODO_PROD_IP` — fill these in once the boxes are provisioned. Don't run against a TODO inventory: the placeholder isn't resolvable, so the play fails at the connection step without touching anything.
+
+### Tags — apply a single concern
+
+```bash
+# Re-render only the SSH hardening drop-in
+ansible-playbook -i inventory/lab.yml playbooks/site.yml --tags ssh
+
+# Bump node_exporter to a newer pinned version (after editing group_vars/all.yml)
+ansible-playbook -i inventory/lab.yml playbooks/site.yml --tags node_exporter
+```
+
+Available tags: `common`, `packages`, `users`, `ssh`, `fail2ban`, `unattended-upgrades`, `monitoring`, `node_exporter`, `incus`, `init`, `service`.
+
+## Roles
+
+### `common` — host baseline
+
+- `ssh.yml` — drops `/etc/ssh/sshd_config.d/50-veza-hardening.conf` from a Jinja template. Validates the rendered config with `sshd -t` before reload; refuses to apply when `ssh_allow_users` is empty (which would lock the operator out).
+- `fail2ban.yml` — `/etc/fail2ban/jail.local` with the sshd jail enabled; defaults to bantime=1h / findtime=10min / maxretry=5.
+- `unattended_upgrades.yml` — security-only origins; `Automatic-Reboot=false` (the operator decides reboot windows).
+- `node_exporter.yml` — installs Prometheus node_exporter pinned to the version in `group_vars/all.yml::monitoring_node_exporter_version`, running as a systemd unit on `:9100`.
+
+Variables in `group_vars/all.yml`:
+
+| var | default | notes |
+|---|---|---|
+| `ssh_port` | `22` | bump for prod once a bastion is in place |
+| `ssh_permit_root_login` | `"no"` | string, not boolean (sshd config syntax) |
+| `ssh_password_authentication` | `"no"` | |
+| `ssh_allow_users` | `[senke, ansible]` | role asserts non-empty |
+| `fail2ban_bantime` | `3600` | seconds |
+| `fail2ban_findtime` | `600` | seconds |
+| `fail2ban_maxretry` | `5` | |
+| `unattended_upgrades_origins` | security-only | |
+| `unattended_upgrades_auto_reboot` | `false` | operator-driven |
+| `monitoring_node_exporter_version` | `1.8.2` | upstream pin |
+| `monitoring_node_exporter_port` | `9100` | |
+
+### `incus_host` — Incus server install
+
+- Adds the upstream zabbly Incus apt repo.
+- Installs `incus` + `incus-client`.
+- Adds the `ansible` user to `incus-admin` so subsequent roles can run `incus` non-sudo.
+- First-time `incus admin init` via preseed if the host has never been initialised. Re-runs on initialised hosts are a no-op (the `incus list` probe gates the init); see the check below.
+
+Bridge config:
+
+| var | default | notes |
+|---|---|---|
+| `incus_bridge` | `incusbr0` | the bridge Veza app containers attach to |
+| `incus_bridge_ipv4` | `10.99.0.1/24` | NAT'd via Incus by default |
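+
+A quick post-apply sanity check on the lab host (illustrative; assumes the defaults above):
+
+```bash
+incus network show incusbr0   # expect ipv4.address: 10.99.0.1/24, ipv4.nat: "true"
+incus storage list            # expect the default pool
+```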
+
+## Conventions
+
+- Roles are **idempotent** — running `site.yml` twice produces no changes. CI eventually validates this with a `--check` after a real apply.
+- **No secrets in git.** `host_vars/<host>.yml` is fine for non-secrets; secrets go in `host_vars/<host>.vault.yml` encrypted with `ansible-vault`. The vault key lives outside the repo.
+- **Tags are mandatory** on every task so a partial apply (`--tags ssh,monitoring`) is always possible. A new role missing tags fails its own commit's `--check` review.
+- **Comment the why, not the what.** Role tasks should answer "why this knob, why this default, why this guard" — the task name + module already say what.
+
+## See also
+
+- `ROADMAP_V1.0_LAUNCH.md` §Semaine 1 day 5 — original scope brief
+- `docs/runbooks/` — once roles for production services land, each gets a runbook
+- `docker-compose.dev.yml` — the dev-host equivalent of these roles (kept for now; Ansible takes over for staging/prod once W2 lands)
diff --git a/infra/ansible/ansible.cfg b/infra/ansible/ansible.cfg
new file mode 100644
index 000000000..c6ef89524
--- /dev/null
+++ b/infra/ansible/ansible.cfg
@@ -0,0 +1,20 @@
+[defaults]
+# Pin inventory + roles paths so any `ansible-playbook` invocation
+# from this directory wires up the same way regardless of the user's
+# global ~/.ansible.cfg or env vars.
+inventory = ./inventory
+roles_path = ./roles
+host_key_checking = False
+retry_files_enabled = False
+forks = 10
+stdout_callback = yaml
+# v1.0.9 Day 5: no cowsay banners. Pass --diff alongside --check so a
+# dry-run review prints each changed file's before/after.
+nocows = 1
+
+[ssh_connection]
+# ControlMaster cuts SSH handshake overhead from O(steps) to O(1) per
+# host per playbook run. Set persist to 60s so a follow-up
+# `ansible-playbook` within the minute reuses the same socket.
+ssh_args = -o ControlMaster=auto -o ControlPersist=60s -o ServerAliveInterval=15
+pipelining = True
diff --git a/infra/ansible/group_vars/all.yml b/infra/ansible/group_vars/all.yml
new file mode 100644
index 000000000..7cdd4be5e
--- /dev/null
+++ b/infra/ansible/group_vars/all.yml
@@ -0,0 +1,40 @@
+# Shared defaults across every inventory (lab/staging/prod). Override
+# per-environment in `group_vars/<env>.yml` or per-host in
+# `host_vars/<host>.yml`.
+---
+# Owner contact (used in some unattended-upgrades + monitoring agent configs).
+veza_ops_email: ops@veza.fr
+
+# v1.0.9 Day 5: SSH hardening surface that the `common` role enforces.
+# Override these in production via group_vars/veza_prod.yml when the
+# bastion's specific port / allowed users are decided. Defaults are
+# safe for lab.
+ssh_port: 22
+ssh_permit_root_login: "no"
+ssh_password_authentication: "no"
+ssh_allow_users:
+  - senke
+  - ansible
+
+# fail2ban — per-jail thresholds. The defaults are conservative for
+# a self-hosted single-machine deployment; production may want
+# lower findtime / higher bantime once Forgejo + Veza traffic is
+# baselined.
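+# An override would look like (illustrative; no group_vars/veza_prod.yml
+# exists yet):
+#   fail2ban_bantime: 86400    # 1 day
+#   fail2ban_findtime: 300     # 5 min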
+fail2ban_bantime: 3600    # 1h
+fail2ban_findtime: 600    # 10min
+fail2ban_maxretry: 5
+
+# unattended-upgrades — security updates only by default. The role
+# never enables auto-reboot; ROADMAP_V1.0_LAUNCH.md §5 game day pins
+# downtime windows to controlled cycles, not OS-driven reboots.
+unattended_upgrades_origins:
+  - "${distro_id}:${distro_codename}-security"
+  - "${distro_id}ESMApps:${distro_codename}-apps-security"
+  - "${distro_id}ESM:${distro_codename}-infra-security"
+unattended_upgrades_auto_reboot: false
+
+# Monitoring agent: prometheus node_exporter is the bare-minimum
+# host metrics surface (CPU / memory / disk / network). The
+# observability stack (Tempo + Loki + Grafana) lands W2 in the roadmap.
+monitoring_node_exporter_version: "1.8.2"
+monitoring_node_exporter_port: 9100
diff --git a/infra/ansible/inventory/lab.yml b/infra/ansible/inventory/lab.yml
new file mode 100644
index 000000000..8940282c2
--- /dev/null
+++ b/infra/ansible/inventory/lab.yml
@@ -0,0 +1,21 @@
+# Lab inventory — the R720's local lab Incus container used to dry-run
+# role changes before they touch staging or prod. Override
+# ansible_host / ansible_user / ansible_port in `host_vars/<host>.yml`
+# (gitignored if it carries credentials, otherwise plain values).
+#
+# Usage:
+#   ansible-playbook -i inventory/lab.yml playbooks/site.yml --check
+#   ansible-playbook -i inventory/lab.yml playbooks/site.yml
+all:
+  hosts:
+    veza-lab:
+      ansible_host: 10.0.20.150
+      ansible_user: senke
+      ansible_python_interpreter: /usr/bin/python3
+  children:
+    incus_hosts:
+      hosts:
+        veza-lab:
+    veza_lab:
+      hosts:
+        veza-lab:
diff --git a/infra/ansible/inventory/prod.yml b/infra/ansible/inventory/prod.yml
new file mode 100644
index 000000000..4d8cab5c9
--- /dev/null
+++ b/infra/ansible/inventory/prod.yml
@@ -0,0 +1,21 @@
+# Prod inventory — single R720 (self-hosted Incus) at launch, with
+# overflow onto Hetzner planned post-launch. ROADMAP_V1.0_LAUNCH.md §2
+# documents the COMPRESSED HA stance: real multi-host HA arrives
+# v1.1+; v1.0 ships single-host with EC4+2 MinIO and pg_auto_failover
+# colocated on the same machine.
+#
+# Real ansible_host left as TODO until DNS (EX-5) is live. Use
+# ssh-config aliases or fill these in once `api.veza.fr` resolves.
+all:
+  hosts:
+    veza-prod:
+      ansible_host: TODO_PROD_IP
+      ansible_user: ansible
+      ansible_python_interpreter: /usr/bin/python3
+  children:
+    incus_hosts:
+      hosts:
+        veza-prod:
+    veza_prod:
+      hosts:
+        veza-prod:
diff --git a/infra/ansible/inventory/staging.yml b/infra/ansible/inventory/staging.yml
new file mode 100644
index 000000000..dfaf65e4e
--- /dev/null
+++ b/infra/ansible/inventory/staging.yml
@@ -0,0 +1,20 @@
+# Staging inventory — Hetzner Cloud host that mirrors the prod topology
+# (Postgres + Redis + RabbitMQ + MinIO + backend/web/stream
+# containers) at a smaller scale, for pre-deploy validation.
+#
+# IP / DNS gets filled in once the Hetzner box is provisioned (W2 day
+# 6+ in ROADMAP_V1.0_LAUNCH.md). Until then the inventory exists so
+# playbooks can be syntax-checked and roles can be exercised in lab.
+all:
+  hosts:
+    veza-staging:
+      ansible_host: TODO_HETZNER_IP
+      ansible_user: ansible
+      ansible_python_interpreter: /usr/bin/python3
+  children:
+    incus_hosts:
+      hosts:
+        veza-staging:
+    veza_staging:
+      hosts:
+        veza-staging:
diff --git a/infra/ansible/playbooks/site.yml b/infra/ansible/playbooks/site.yml
new file mode 100644
index 000000000..2eceaa6ff
--- /dev/null
+++ b/infra/ansible/playbooks/site.yml
@@ -0,0 +1,25 @@
+# Site playbook — entry point for any environment.
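+#
+# Pre-flight connectivity check (illustrative):
+#   ansible -i inventory/lab.yml all -m ping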
+#
+# v1.0.9 Day 5: roles common + incus_host land here. Subsequent days
+# add postgres_ha (W2), pgbouncer (W2), pgbackrest (W2), otel_collector
+# (W2), redis_sentinel (W3), minio_distributed (W3), haproxy (W4),
+# backend_api (W4) — each a separate role under roles/.
+#
+# Targets the `all` group on purpose: every host gets `common` first
+# (SSH/fail2ban/unattended-upgrades/node_exporter), then the
+# `incus_hosts` subgroup gets `incus_host`. Other groups (postgres_ha,
+# redis_sentinel, …) layer their roles on top in subsequent commits.
+---
+- name: Common baseline (SSH hardening, fail2ban, unattended-upgrades, node_exporter)
+  hosts: all
+  become: true
+  gather_facts: true
+  roles:
+    - common
+
+- name: Incus host (host-level Incus install + networking)
+  hosts: incus_hosts
+  become: true
+  gather_facts: true
+  roles:
+    - incus_host
diff --git a/infra/ansible/roles/common/defaults/main.yml b/infra/ansible/roles/common/defaults/main.yml
new file mode 100644
index 000000000..de3c91e6e
--- /dev/null
+++ b/infra/ansible/roles/common/defaults/main.yml
@@ -0,0 +1,20 @@
+# Per-role defaults — overridable per host/group. group_vars/all.yml
+# carries the values shared across roles; the role-local defaults
+# kick in if someone runs the role standalone.
+---
+ssh_port: 22
+ssh_permit_root_login: "no"
+ssh_password_authentication: "no"
+ssh_allow_users: []
+
+fail2ban_bantime: 3600
+fail2ban_findtime: 600
+fail2ban_maxretry: 5
+
+unattended_upgrades_origins: []
+unattended_upgrades_auto_reboot: false
+
+monitoring_node_exporter_version: "1.8.2"
+monitoring_node_exporter_port: 9100
+
+veza_ops_email: ops@veza.fr
diff --git a/infra/ansible/roles/common/handlers/main.yml b/infra/ansible/roles/common/handlers/main.yml
new file mode 100644
index 000000000..21b7242f5
--- /dev/null
+++ b/infra/ansible/roles/common/handlers/main.yml
@@ -0,0 +1,21 @@
+---
+- name: Reload sshd
+  ansible.builtin.service:
+    name: ssh
+    state: reloaded
+
+- name: Restart fail2ban
+  ansible.builtin.service:
+    name: fail2ban
+    state: restarted
+
+- name: Restart unattended-upgrades
+  ansible.builtin.service:
+    name: unattended-upgrades
+    state: restarted
+
+- name: Restart node_exporter
+  ansible.builtin.systemd:
+    name: node_exporter
+    state: restarted
+    daemon_reload: true
diff --git a/infra/ansible/roles/common/tasks/fail2ban.yml b/infra/ansible/roles/common/tasks/fail2ban.yml
new file mode 100644
index 000000000..421f8d4a2
--- /dev/null
+++ b/infra/ansible/roles/common/tasks/fail2ban.yml
@@ -0,0 +1,18 @@
+# fail2ban — sshd jail tuned for the variables in group_vars/all.yml.
+# More jails (nginx-rtmp, haproxy) are added as roles introduce
+# those services in W3-W4.
+---
+- name: Render fail2ban jail.local
+  ansible.builtin.template:
+    src: jail.local.j2
+    dest: /etc/fail2ban/jail.local
+    owner: root
+    group: root
+    mode: "0644"
+  notify: Restart fail2ban
+
+- name: Ensure fail2ban is enabled + running
+  ansible.builtin.service:
+    name: fail2ban
+    state: started
+    enabled: true
diff --git a/infra/ansible/roles/common/tasks/main.yml b/infra/ansible/roles/common/tasks/main.yml
new file mode 100644
index 000000000..32839db7f
--- /dev/null
+++ b/infra/ansible/roles/common/tasks/main.yml
@@ -0,0 +1,58 @@
+# Common baseline applied on every veza host (lab / staging / prod).
+# Idempotent — safe to re-run on every playbook execution.
+#
+# Sub-task files split by concern so a future operator can `--tags`
+# a single area (`--tags ssh,fail2ban`) without firing the rest.
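+#
+# Idempotence spot-check (illustrative): apply once, then re-run with
+# `--check --diff`; the second pass should report zero changes.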
+---
+- name: Update apt cache (only when older than 1 hour)
+  ansible.builtin.apt:
+    update_cache: true
+    cache_valid_time: 3600
+  changed_when: false
+  tags: [common, packages]
+
+- name: Install baseline packages
+  ansible.builtin.apt:
+    name:
+      - curl
+      - ca-certificates
+      - gnupg
+      - lsb-release
+      - htop
+      - vim
+      - git
+      - jq
+      - rsync
+      - ufw
+      - fail2ban
+      - unattended-upgrades
+      - apt-listchanges
+      - python3-apt
+    state: present
+  tags: [common, packages]
+
+- name: Ensure ansible user exists (idempotent — no-op if pre-provisioned)
+  ansible.builtin.user:
+    name: ansible
+    shell: /bin/bash
+    groups: sudo
+    append: true
+    create_home: true
+    state: present
+  tags: [common, users]
+
+- name: Import SSH hardening sub-tasks
+  ansible.builtin.import_tasks: ssh.yml
+  tags: [common, ssh]
+
+- name: Import fail2ban sub-tasks
+  ansible.builtin.import_tasks: fail2ban.yml
+  tags: [common, fail2ban]
+
+- name: Import unattended-upgrades sub-tasks
+  ansible.builtin.import_tasks: unattended_upgrades.yml
+  tags: [common, unattended-upgrades]
+
+- name: Import node_exporter sub-tasks
+  ansible.builtin.import_tasks: node_exporter.yml
+  tags: [common, monitoring, node_exporter]
diff --git a/infra/ansible/roles/common/tasks/node_exporter.yml b/infra/ansible/roles/common/tasks/node_exporter.yml
new file mode 100644
index 000000000..1d910cd8e
--- /dev/null
+++ b/infra/ansible/roles/common/tasks/node_exporter.yml
@@ -0,0 +1,56 @@
+# Prometheus node_exporter — host metrics surface for the
+# observability stack (Tempo + Loki + Grafana wired in W2 day 9).
+# Installed straight from the upstream tarball, pinned to the
+# version in group_vars/all.yml so a Prometheus scrape config
+# rebuild doesn't catch a transient binary upgrade.
+---
+- name: Create node_exporter system user
+  ansible.builtin.user:
+    name: node_exporter
+    system: true
+    shell: /usr/sbin/nologin
+    home: /var/lib/node_exporter
+    create_home: false
+    state: present
+
+- name: Check installed node_exporter version
+  ansible.builtin.command: /usr/local/bin/node_exporter --version
+  register: node_exporter_installed_version
+  changed_when: false
+  failed_when: false
+  check_mode: false
+
+- name: Download + install node_exporter binary
+  ansible.builtin.unarchive:
+    src: "https://github.com/prometheus/node_exporter/releases/download/v{{ monitoring_node_exporter_version }}/node_exporter-{{ monitoring_node_exporter_version }}.linux-amd64.tar.gz"
+    dest: /tmp
+    remote_src: true
+    creates: "/tmp/node_exporter-{{ monitoring_node_exporter_version }}.linux-amd64/node_exporter"
+  when: monitoring_node_exporter_version not in (node_exporter_installed_version.stdout | default(''))
+
+- name: Move node_exporter binary into /usr/local/bin
+  ansible.builtin.copy:
+    src: "/tmp/node_exporter-{{ monitoring_node_exporter_version }}.linux-amd64/node_exporter"
+    dest: /usr/local/bin/node_exporter
+    remote_src: true
+    owner: node_exporter
+    group: node_exporter
+    mode: "0755"
+  when: monitoring_node_exporter_version not in (node_exporter_installed_version.stdout | default(''))
+  notify: Restart node_exporter
+
+- name: Render node_exporter systemd unit
+  ansible.builtin.template:
+    src: node_exporter.service.j2
+    dest: /etc/systemd/system/node_exporter.service
+    owner: root
+    group: root
+    mode: "0644"
+  notify: Restart node_exporter
+
+- name: Enable + start node_exporter service
+  ansible.builtin.systemd:
+    name: node_exporter
+    state: started
+    enabled: true
+    daemon_reload: true
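+
+# Scrape sanity check (illustrative):
+#   curl -s localhost:9100/metrics | head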
diff --git a/infra/ansible/roles/common/tasks/ssh.yml b/infra/ansible/roles/common/tasks/ssh.yml
new file mode 100644
index 000000000..8988836d2
--- /dev/null
+++ b/infra/ansible/roles/common/tasks/ssh.yml
@@ -0,0 +1,30 @@
+# SSH hardening — disable root login + password auth, restrict to a
+# whitelist of users. The role refuses to lock the operator out: it
+# asserts the AllowUsers list is non-empty before rendering, and the
+# drop-in is validated with `sshd -t` before sshd reloads.
+---
+- name: Sanity check — ssh_allow_users must be non-empty
+  ansible.builtin.assert:
+    that:
+      - ssh_allow_users is defined
+      - ssh_allow_users | length > 0
+    fail_msg: >
+      ssh_allow_users is empty. Refusing to apply sshd_config which
+      would lock everyone out. Set ssh_allow_users in
+      group_vars/all.yml (or override per environment).
+
+- name: Render sshd_config drop-in (50-veza-hardening.conf)
+  ansible.builtin.template:
+    src: sshd_hardening.conf.j2
+    dest: /etc/ssh/sshd_config.d/50-veza-hardening.conf
+    owner: root
+    group: root
+    mode: "0644"
+    validate: /usr/sbin/sshd -t -f %s
+  notify: Reload sshd
+
+- name: Ensure sshd is enabled + running
+  ansible.builtin.service:
+    name: ssh
+    state: started
+    enabled: true
diff --git a/infra/ansible/roles/common/tasks/unattended_upgrades.yml b/infra/ansible/roles/common/tasks/unattended_upgrades.yml
new file mode 100644
index 000000000..ab9f54711
--- /dev/null
+++ b/infra/ansible/roles/common/tasks/unattended_upgrades.yml
@@ -0,0 +1,30 @@
+# unattended-upgrades — security-only updates, no auto-reboot.
+# Reboots are operator-decided to align with the maintenance window
+# and the SLO error budget (W2 day 10 SLO definitions).
+---
+- name: Render 50unattended-upgrades drop-in
+  ansible.builtin.template:
+    src: 50unattended-upgrades.j2
+    dest: /etc/apt/apt.conf.d/50unattended-upgrades
+    owner: root
+    group: root
+    mode: "0644"
+  notify: Restart unattended-upgrades
+
+- name: Render 20auto-upgrades — enable timer
+  ansible.builtin.copy:
+    dest: /etc/apt/apt.conf.d/20auto-upgrades
+    owner: root
+    group: root
+    mode: "0644"
+    content: |
+      APT::Periodic::Update-Package-Lists "1";
+      APT::Periodic::Unattended-Upgrade "1";
+      APT::Periodic::AutocleanInterval "7";
+  notify: Restart unattended-upgrades
+
+- name: Ensure unattended-upgrades is enabled
+  ansible.builtin.service:
+    name: unattended-upgrades
+    state: started
+    enabled: true
diff --git a/infra/ansible/roles/common/templates/50unattended-upgrades.j2 b/infra/ansible/roles/common/templates/50unattended-upgrades.j2
new file mode 100644
index 000000000..18b14b51a
--- /dev/null
+++ b/infra/ansible/roles/common/templates/50unattended-upgrades.j2
@@ -0,0 +1,14 @@
+// Managed by Ansible — do not edit by hand.
+// Source: infra/ansible/roles/common/templates/50unattended-upgrades.j2
+
+Unattended-Upgrade::Allowed-Origins {
+{% for origin in unattended_upgrades_origins %}
+    "{{ origin }}";
+{% endfor %}
+};
+
+Unattended-Upgrade::Mail "{{ veza_ops_email }}";
+Unattended-Upgrade::MailReport "on-change";
+Unattended-Upgrade::Remove-Unused-Kernel-Packages "true";
+Unattended-Upgrade::Remove-Unused-Dependencies "true";
+Unattended-Upgrade::Automatic-Reboot "{{ unattended_upgrades_auto_reboot | string | lower }}";
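+
+// Dry-run check after changes (illustrative):
+//   unattended-upgrade --dry-run --debug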
diff --git a/infra/ansible/roles/common/templates/jail.local.j2 b/infra/ansible/roles/common/templates/jail.local.j2
new file mode 100644
index 000000000..8339fb180
--- /dev/null
+++ b/infra/ansible/roles/common/templates/jail.local.j2
@@ -0,0 +1,17 @@
+# Managed by Ansible — do not edit by hand.
+# Source: infra/ansible/roles/common/templates/jail.local.j2
+
+[DEFAULT]
+bantime = {{ fail2ban_bantime }}
+findtime = {{ fail2ban_findtime }}
+maxretry = {{ fail2ban_maxretry }}
+backend = systemd
+
+# Don't ban the operator's local network during lab work.
+ignoreip = 127.0.0.1/8 10.0.0.0/8 192.168.0.0/16
+
+[sshd]
+enabled = true
+port = {{ ssh_port }}
+filter = sshd
+logpath = /var/log/auth.log
diff --git a/infra/ansible/roles/common/templates/node_exporter.service.j2 b/infra/ansible/roles/common/templates/node_exporter.service.j2
new file mode 100644
index 000000000..0d8052ecd
--- /dev/null
+++ b/infra/ansible/roles/common/templates/node_exporter.service.j2
@@ -0,0 +1,25 @@
+# Managed by Ansible — do not edit by hand.
+# Source: infra/ansible/roles/common/templates/node_exporter.service.j2
+
+[Unit]
+Description=Prometheus node_exporter
+After=network-online.target
+Wants=network-online.target
+
+[Service]
+User=node_exporter
+Group=node_exporter
+Type=simple
+ExecStart=/usr/local/bin/node_exporter \
+    --web.listen-address=:{{ monitoring_node_exporter_port }} \
+    --collector.systemd \
+    --collector.processes
+Restart=on-failure
+RestartSec=5s
+NoNewPrivileges=true
+ProtectSystem=strict
+ProtectHome=true
+PrivateTmp=true
+
+[Install]
+WantedBy=multi-user.target
diff --git a/infra/ansible/roles/common/templates/sshd_hardening.conf.j2 b/infra/ansible/roles/common/templates/sshd_hardening.conf.j2
new file mode 100644
index 000000000..b9bff5080
--- /dev/null
+++ b/infra/ansible/roles/common/templates/sshd_hardening.conf.j2
@@ -0,0 +1,18 @@
+# Managed by Ansible — do not edit by hand.
+# Source: infra/ansible/roles/common/templates/sshd_hardening.conf.j2
+# Re-render with: ansible-playbook -i inventory/<env>.yml playbooks/site.yml --tags ssh
+
+Port {{ ssh_port }}
+PermitRootLogin {{ ssh_permit_root_login }}
+PasswordAuthentication {{ ssh_password_authentication }}
+PubkeyAuthentication yes
+KbdInteractiveAuthentication no
+ChallengeResponseAuthentication no
+UsePAM yes
+X11Forwarding no
+PrintMotd no
+ClientAliveInterval 300
+ClientAliveCountMax 2
+MaxAuthTries 3
+LoginGraceTime 30
+AllowUsers {{ ssh_allow_users | join(' ') }}
diff --git a/infra/ansible/roles/incus_host/defaults/main.yml b/infra/ansible/roles/incus_host/defaults/main.yml
new file mode 100644
index 000000000..f6658722c
--- /dev/null
+++ b/infra/ansible/roles/incus_host/defaults/main.yml
@@ -0,0 +1,6 @@
+---
+# Bridge the Veza containers attach to. Override per environment if a
+# different subnet is desired (e.g. staging on Hetzner using the cloud
+# private network range).
+incus_bridge: incusbr0
+incus_bridge_ipv4: 10.99.0.1/24
diff --git a/infra/ansible/roles/incus_host/handlers/main.yml b/infra/ansible/roles/incus_host/handlers/main.yml
new file mode 100644
index 000000000..4490bc09b
--- /dev/null
+++ b/infra/ansible/roles/incus_host/handlers/main.yml
@@ -0,0 +1,10 @@
+---
+- name: Update apt cache after Incus repo add
+  ansible.builtin.apt:
+    update_cache: true
+  changed_when: false
+
+- name: Restart incus
+  ansible.builtin.service:
+    name: incus
+    state: restarted
diff --git a/infra/ansible/roles/incus_host/tasks/main.yml b/infra/ansible/roles/incus_host/tasks/main.yml
new file mode 100644
index 000000000..f6088a6e3
--- /dev/null
+++ b/infra/ansible/roles/incus_host/tasks/main.yml
@@ -0,0 +1,101 @@
+# Incus host role — installs Incus from the upstream zabbly repo
+# (Ubuntu 22.04+) and stages the network bridge that Veza
+# containers attach to.
+#
+# v1.0.9 Day 5: bare bones (install + bridge + first-time init).
+# Postgres / Redis / MinIO / RabbitMQ / Veza app containers land in
+# their own roles (W2-W4) and reference `incus_bridge` here.
+#
+# Idempotent — running on a host that already has Incus reuses
+# the existing config rather than re-initialising.
+---
+- name: Install zabbly Incus repo signing key
+  ansible.builtin.get_url:
+    url: https://pkgs.zabbly.com/key.asc
+    dest: /etc/apt/keyrings/zabbly.asc
+    mode: "0644"
+    force: false
+  tags: [incus, packages]
+
+- name: Add zabbly Incus apt source
+  ansible.builtin.copy:
+    dest: /etc/apt/sources.list.d/zabbly-incus-stable.sources
+    owner: root
+    group: root
+    mode: "0644"
+    content: |
+      Enabled: yes
+      Types: deb
+      URIs: https://pkgs.zabbly.com/incus/stable
+      Suites: {{ ansible_distribution_release }}
+      Components: main
+      Architectures: {{ ansible_architecture | replace('x86_64', 'amd64') }}
+      Signed-By: /etc/apt/keyrings/zabbly.asc
+  notify: Update apt cache after Incus repo add
+  tags: [incus, packages]
+
+- name: Update apt cache (Incus repo just added)
+  ansible.builtin.apt:
+    update_cache: true
+  changed_when: false
+  tags: [incus, packages]
+
+- name: Install Incus packages
+  ansible.builtin.apt:
+    name:
+      - incus
+      - incus-client
+    state: present
+  tags: [incus, packages]
+
+- name: Ensure ansible user is in the incus-admin group (lets it run `incus` non-sudo)
+  ansible.builtin.user:
+    name: ansible
+    groups: incus-admin
+    append: true
+  tags: [incus, users]
+
+- name: Check whether Incus is already initialised
+  ansible.builtin.command: incus list
+  register: incus_init_check
+  changed_when: false
+  failed_when: false
+  check_mode: false
+  tags: [incus, init]
+
+- name: First-time Incus init via preseed (only when not initialised)
+  ansible.builtin.shell:
+    cmd: |
+      cat <
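
For orientation, a minimal preseed for the incusbr0 / 10.99.0.1/24 NAT bridge and default storage pool described above might look like this (a sketch; the `dir` storage driver and profile wiring are assumptions, not recoverable from this copy of the patch):

```yaml
config: {}
networks:
  - name: incusbr0
    type: bridge
    config:
      ipv4.address: 10.99.0.1/24
      ipv4.nat: "true"
      ipv6.address: none
storage_pools:
  - name: default
    driver: dir
profiles:
  - name: default
    devices:
      eth0:
        name: eth0
        network: incusbr0
        type: nic
      root:
        path: /
        pool: default
        type: disk
```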