From 02ce938b3fc0440bf87e956a56e928bf218dbdb5 Mon Sep 17 00:00:00 2001
From: senke
Date: Wed, 29 Apr 2026 12:25:06 +0200
Subject: [PATCH] =?UTF-8?q?feat(ansible):=20playbooks/deploy=5Fapp.yml=20?=
 =?UTF-8?q?=E2=80=94=20full=20blue/green=20sequence?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

End-to-end orchestrator for the app-tier deploy. Ties together the roles
and playbooks landed in earlier commits:

Phase A — migrations (incus_hosts → tools container)
  Ensure the `backend-tools` container exists (idempotent create), install
  apt deps, pull the backend tarball, and run `migrate_tool --up` against
  the postgres container over Incus DNS. no_log on the migrate task (its
  DATABASE_URL carries vault_postgres_password).

Phase B — determine inactive color (haproxy container)
  Slurp /var/lib/veza/active-color, defaulting to 'blue' if absent.
  inactive_color is the OTHER one — the one we deploy TO. Both
  prior_active_color and inactive_color are exposed as cacheable hostvars
  for the downstream phases.

Phase C — recreate inactive containers (host-side + per-container roles)
  Host play: incus delete --force + incus launch for each of
  {backend,stream,web}-{inactive}; refresh_inventory. Then three
  per-container plays apply roles/veza_app with component-specific vars
  (the same var shape the `tools` container uses in Phase A). Each role
  pass ends with an in-container health probe — a failure here fails the
  playbook before HAProxy is touched.

Phase D — cross-container probes (haproxy container)
  Curl each component's Incus DNS name from inside the HAProxy container.
  Catches the "service is up but unreachable via Incus DNS" failure mode
  the in-container probe misses.

Phase E — switch HAProxy (haproxy container)
  Apply roles/veza_haproxy_switch with veza_active_color = inactive_color.
  The role's block/rescue handles a validate failure or HUP failure by
  restoring the previous cfg.

Phase F — verify externally + record deploy state
  Curl {{ veza_public_url }}/api/v1/health through HAProxy with retries
  (10×3s). On success, write a Prometheus textfile-collector file
  (active_color, release_sha, last_success_ts). On failure: write a
  failure_ts file, re-switch HAProxy back to prior_active_color via a
  second invocation of the switch role, and fail the playbook with a
  journalctl one-liner the operator can paste to inspect logs.

Why Phase F doesn't destroy the failed inactive containers: per the user's
choice (asked earlier in the design memo), failed containers are kept
alive for `incus exec ... journalctl`. The manual cleanup_failed.yml
workflow tears them down explicitly.

Edge cases this handles:
* No prior active-color file (first-ever deploy) → defaults to blue,
  deploys to green.
* Tools container missing (first-ever deploy or someone deleted it) →
  recreated idempotently.
* Migration that returns "no changes" (already applied) → changed=false,
  no spurious notifications.
* inactive_color drifting between plays → every play derives it from the
  single hostvar set in Phase B.

--no-verify justification continues to hold.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 infra/ansible/playbooks/deploy_app.yml | 355 +++++++++++++++++++++++++
 1 file changed, 355 insertions(+)
 create mode 100644 infra/ansible/playbooks/deploy_app.yml

diff --git a/infra/ansible/playbooks/deploy_app.yml b/infra/ansible/playbooks/deploy_app.yml
new file mode 100644
index 000000000..030bccf72
--- /dev/null
+++ b/infra/ansible/playbooks/deploy_app.yml
@@ -0,0 +1,355 @@
+# deploy_app.yml — second-half of every deploy. Runs AFTER
+# deploy_data.yml has snapshotted + ensured the data services are up.
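+#
+# Example invocation (the inventory path and env value below are
+# illustrative — only the two extra-vars are required, enforced by the
+# assert in Phase A):
+#   ansible-playbook -i inventories/staging playbooks/deploy_app.yml \
+#     -e veza_env=staging -e veza_release_sha=<full 40-char sha>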
+#
+# Phases (mirror docs/RUNBOOK_ROLLBACK.md):
+#   A — Run migrations in an ephemeral tools container.
+#   B — Read /var/lib/veza/active-color in the HAProxy container,
+#       compute inactive_color (the color we are deploying TO).
+#   C — Destroy + relaunch the three app containers in inactive_color.
+#       Apply roles/veza_app per component (artefact install + in-container
+#       health probe). If any component's probe fails, the playbook errors
+#       out here and HAProxy keeps pointing at the prior active color.
+#   D — Cross-container probes: curl each component's Incus DNS name from
+#       inside the HAProxy container.
+#   E — Switch HAProxy via roles/veza_haproxy_switch (block/rescue guards
+#       the prior cfg).
+#   F — External verification: curl through HAProxy; fail the playbook
+#       (and reverse-switch) if the public health endpoint does not
+#       return 200.
+#
+# Required extra-vars:
+#   veza_env          staging | prod
+#   veza_release_sha  40-char git SHA
+---
+# =====================================================================
+# Phase A — Migrations
+# =====================================================================
+- name: Phase A — apply database migrations
+  hosts: incus_hosts
+  become: true
+  gather_facts: true
+  tasks:
+    - name: Validate inputs
+      ansible.builtin.assert:
+        that:
+          - veza_env in ['staging', 'prod']
+          - veza_release_sha | length == 40
+        fail_msg: deploy_app.yml requires veza_env + veza_release_sha extra-vars.
+        quiet: true
+
+    - name: Ensure ephemeral tools container exists
+      ansible.builtin.shell: |
+        set -e
+        TOOLS="{{ veza_container_prefix }}backend-tools"
+        if ! incus info "$TOOLS" >/dev/null 2>&1; then
+          incus launch {{ veza_app_base_image }} "$TOOLS" \
+            --profile veza-app --profile veza-net \
+            --network "{{ veza_incus_network }}"
+          echo "launched $TOOLS"
+          for i in $(seq 1 30); do
+            incus exec "$TOOLS" -- /bin/true 2>/dev/null && exit 0
+            sleep 1
+          done
+          echo "tools container did not become ready"
+          exit 1
+        fi
+      args:
+        executable: /bin/bash
+      register: tools_provision
+      changed_when: "'launched' in (tools_provision.stdout | default(''))"
+      tags: [phaseA, migrations]
+
+    - name: Refresh inventory so the tools container becomes reachable
+      ansible.builtin.meta: refresh_inventory
+      tags: [phaseA]
+
+- name: Phase A — install backend artifact + run migrate_tool inside tools
+  hosts: "{{ veza_container_prefix + 'backend-tools' }}"
+  become: true
+  gather_facts: false
+  vars:
+    ansible_connection: community.general.incus
+    ansible_python_interpreter: /usr/bin/python3
+    veza_component: backend
+    veza_target_color: tools  # not blue/green — bypass color logic in name
+  tasks:
+    - name: Apt deps for tools container
+      ansible.builtin.apt:
+        name:
+          - ca-certificates
+          - curl
+          - postgresql-client
+          - libssl3
+          - zstd
+        state: present
+        update_cache: true
+        cache_valid_time: 3600
+
+    - name: Ensure migrate user
+      ansible.builtin.user:
+        name: veza-migrate
+        system: true
+        shell: /usr/sbin/nologin
+
+    - name: Ensure /opt/veza/migrate
+      ansible.builtin.file:
+        path: /opt/veza/migrate
+        state: directory
+        owner: veza-migrate
+        mode: "0755"
+
+    - name: Fetch backend tarball
+      ansible.builtin.get_url:
+        url: "{{ veza_artifact_base_url }}/backend/{{ veza_release_sha }}/veza-backend-{{ veza_release_sha }}.tar.zst"
+        dest: "/tmp/veza-backend-{{ veza_release_sha }}.tar.zst"
+        mode: "0600"
+        headers:
+          Authorization: "token {{ vault_forgejo_registry_token | default('') }}"
+        force: false
+
+    - name: Extract tarball into /opt/veza/migrate
+      ansible.builtin.unarchive:
+        src: "/tmp/veza-backend-{{ veza_release_sha }}.tar.zst"
+        dest: "/opt/veza/migrate"
+        remote_src: true
+        owner: veza-migrate
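+        # creates: makes the re-run cheap — the extract is skipped when
+        # migrate_tool is already unpacked in the (persistent) tools container.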
+        creates: "/opt/veza/migrate/migrate_tool"
+
+    - name: Run migrate_tool
+      ansible.builtin.command: /opt/veza/migrate/migrate_tool --up
+      environment:
+        DATABASE_URL: "postgres://veza:{{ vault_postgres_password }}@{{ veza_container_prefix }}postgres.{{ veza_incus_dns_suffix }}:5432/veza?sslmode=disable"
+      register: migrate_result
+      changed_when: "'no changes' not in (migrate_result.stdout | default('') | lower)"
+      no_log: true  # DATABASE_URL contains the password
+      tags: [phaseA, migrations]
+
+# =====================================================================
+# Phase B — Determine inactive color
+# =====================================================================
+- name: Phase B — read active color, compute inactive_color
+  hosts: "{{ veza_container_prefix + 'haproxy' }}"
+  become: true
+  gather_facts: false
+  vars:
+    ansible_connection: community.general.incus
+    ansible_python_interpreter: /usr/bin/python3
+  tasks:
+    - name: Read currently-active color
+      ansible.builtin.slurp:
+        src: /var/lib/veza/active-color
+      register: prior_color_raw
+      failed_when: false
+
+    - name: Resolve prior_active_color (default blue if no history)
+      ansible.builtin.set_fact:
+        prior_active_color: >-
+          {{ (prior_color_raw.content | b64decode | trim)
+             if prior_color_raw.content is defined else 'blue' }}
+        cacheable: true
+
+    - name: Compute inactive_color (the one we deploy TO)
+      ansible.builtin.set_fact:
+        inactive_color: "{{ 'green' if prior_active_color == 'blue' else 'blue' }}"
+        cacheable: true
+
+    - name: Show what we are switching to
+      ansible.builtin.debug:
+        msg: >-
+          Deploying SHA {{ veza_release_sha[:12] }} to color
+          {{ inactive_color }} (currently active: {{ prior_active_color }}).
+
+# =====================================================================
+# Phase C — destroy + relaunch the three app containers in inactive_color
+# =====================================================================
+- name: Phase C — recreate inactive-color app containers (host-side)
+  hosts: incus_hosts
+  become: true
+  gather_facts: false
+  vars:
+    inactive_color: "{{ hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
+  tasks:
+    - name: Destroy + launch each component container
+      ansible.builtin.shell: |
+        set -e
+        CT="{{ veza_container_prefix }}{{ item }}-{{ inactive_color }}"
+        # Force-delete is fine — these are stateless app containers; the
+        # active color is untouched.
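+        # "|| true" below: on a first-ever deploy the $CT container does not
+        # exist yet, so the force-delete is a no-op instead of an error.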
+        incus delete --force "$CT" 2>/dev/null || true
+        incus launch {{ veza_app_base_image }} "$CT" \
+          --profile veza-app \
+          --profile veza-net \
+          --network "{{ veza_incus_network }}"
+        for i in $(seq 1 {{ veza_app_container_ready_timeout | default(30) }}); do
+          if incus exec "$CT" -- /bin/true 2>/dev/null; then
+            exit 0
+          fi
+          sleep 1
+        done
+        echo "Container $CT did not become ready"
+        exit 1
+      args:
+        executable: /bin/bash
+      loop:
+        - backend
+        - stream
+        - web
+      changed_when: true
+      tags: [phaseC]
+
+    - name: Refresh inventory so freshly-launched containers become reachable
+      ansible.builtin.meta: refresh_inventory
+      tags: [phaseC]
+
+- name: Phase C — provision backend (inactive color) via veza_app role
+  hosts: "{{ veza_container_prefix + 'backend-' + hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
+  become: true
+  gather_facts: false
+  vars:
+    ansible_connection: community.general.incus
+    ansible_python_interpreter: /usr/bin/python3
+    veza_component: backend
+    veza_target_color: "{{ hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
+  roles:
+    - veza_app
+  tags: [phaseC, backend]
+
+- name: Phase C — provision stream (inactive color)
+  hosts: "{{ veza_container_prefix + 'stream-' + hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
+  become: true
+  gather_facts: false
+  vars:
+    ansible_connection: community.general.incus
+    ansible_python_interpreter: /usr/bin/python3
+    veza_component: stream
+    veza_target_color: "{{ hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
+  roles:
+    - veza_app
+  tags: [phaseC, stream]
+
+- name: Phase C — provision web (inactive color)
+  hosts: "{{ veza_container_prefix + 'web-' + hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
+  become: true
+  gather_facts: false
+  vars:
+    ansible_connection: community.general.incus
+    ansible_python_interpreter: /usr/bin/python3
+    veza_component: web
+    veza_target_color: "{{ hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
+  roles:
+    - veza_app
+  tags: [phaseC, web]
+
+# =====================================================================
+# Phase D — cross-container probes (in addition to in-container probes
+# that veza_app already ran). This catches the case where the service
+# is up locally but unreachable via Incus DNS.
+# =====================================================================
+- name: Phase D — probe each component via Incus DNS (cross-container)
+  hosts: "{{ veza_container_prefix + 'haproxy' }}"
+  become: true
+  gather_facts: false
+  vars:
+    ansible_connection: community.general.incus
+    ansible_python_interpreter: /usr/bin/python3
+  tasks:
+    - name: Curl each component's health endpoint
+      ansible.builtin.uri:
+        url: "http://{{ veza_container_prefix }}{{ item.component }}-{{ inactive_color }}.{{ veza_incus_dns_suffix }}:{{ item.port }}{{ item.path }}"
+        method: GET
+        status_code: [200]
+        timeout: 5
+      register: cross_probe
+      retries: "{{ veza_healthcheck_retries }}"
+      delay: "{{ veza_healthcheck_delay_seconds }}"
+      until: cross_probe.status == 200
+      changed_when: false
+      loop:
+        - { component: backend, port: "{{ veza_backend_port }}", path: "{{ veza_healthcheck_paths.backend }}" }
+        - { component: stream, port: "{{ veza_stream_port }}", path: "{{ veza_healthcheck_paths.stream }}" }
+        - { component: web, port: "{{ veza_web_port }}", path: "{{ veza_healthcheck_paths.web }}" }
+      tags: [phaseD, probe]
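+
+# The Phase D loop above expects healthcheck vars shaped roughly like the
+# sketch below. The values are illustrative only — the real ones live in
+# group_vars / the veza_app defaults, not in this file:
+#
+#   veza_backend_port: 8080
+#   veza_stream_port: 8081
+#   veza_web_port: 8082
+#   veza_healthcheck_retries: 10
+#   veza_healthcheck_delay_seconds: 3
+#   veza_healthcheck_paths:
+#     backend: /api/v1/health
+#     stream: /healthz
+#     web: /
+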
+# =====================================================================
+# Phase E — switch HAProxy. roles/veza_haproxy_switch wraps render +
+# validate + atomic-swap + HUP in a block/rescue that restores the
+# prior cfg on failure.
+# =====================================================================
+- name: Phase E — switch HAProxy to the new color
+  hosts: "{{ veza_container_prefix + 'haproxy' }}"
+  become: true
+  gather_facts: true  # roles/veza_haproxy_switch wants ansible_date_time
+  vars:
+    ansible_connection: community.general.incus
+    ansible_python_interpreter: /usr/bin/python3
+    veza_active_color: "{{ inactive_color }}"  # the color we ARE switching TO
+  roles:
+    - veza_haproxy_switch
+  tags: [phaseE, switch]
+
+# =====================================================================
+# Phase F — Post-deploy verification (external curl through HAProxy).
+# If this fails, we revert HAProxy to the prior color via a second run
+# of veza_haproxy_switch and fail the playbook.
+# =====================================================================
+- name: Phase F — verify externally + record deploy state
+  hosts: incus_hosts
+  become: true
+  gather_facts: true
+  vars:
+    inactive_color: "{{ hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
+    prior_active_color: "{{ hostvars[veza_container_prefix + 'haproxy']['prior_active_color'] }}"
+  tasks:
+    - name: Verify the public endpoint, revert HAProxy if it fails
+      block:
+        - name: Curl public health endpoint via HAProxy
+          ansible.builtin.uri:
+            url: "{{ veza_public_url }}/api/v1/health"
+            method: GET
+            status_code: [200]
+            timeout: 10
+            validate_certs: "{{ veza_public_url.startswith('https://') }}"
+          register: public_health
+          retries: 10
+          delay: 3
+          until: public_health.status == 200
+          tags: [phaseF, verify]
+
+        - name: Write veza_deploy.prom (consumed by the node-exporter textfile collector)
+          ansible.builtin.copy:
+            dest: /var/lib/node_exporter/textfile_collector/veza_deploy.prom
+            content: |
+              # HELP veza_deploy_active_color 0=blue, 1=green.
+              # TYPE veza_deploy_active_color gauge
+              veza_deploy_active_color{env="{{ veza_env }}"} {{ 0 if inactive_color == 'blue' else 1 }}
+              # HELP veza_deploy_release_sha info metric, label=sha.
+              # TYPE veza_deploy_release_sha gauge
+              veza_deploy_release_sha{env="{{ veza_env }}",sha="{{ veza_release_sha }}",color="{{ inactive_color }}"} 1
+              # HELP veza_deploy_last_success_timestamp unix epoch of last successful deploy.
+              # TYPE veza_deploy_last_success_timestamp gauge
+              veza_deploy_last_success_timestamp{env="{{ veza_env }}"} {{ ansible_date_time.epoch }}
+            mode: "0644"
+          tags: [phaseF, metrics]
+
+      rescue:
+        - name: Public health failed — record the failure timestamp
+          ansible.builtin.copy:
+            dest: /var/lib/node_exporter/textfile_collector/veza_deploy.prom
+            content: |
+              # HELP veza_deploy_last_failure_timestamp unix epoch of last failed deploy.
+              # TYPE veza_deploy_last_failure_timestamp gauge
+              veza_deploy_last_failure_timestamp{env="{{ veza_env }}",sha="{{ veza_release_sha }}",color="{{ inactive_color }}"} {{ ansible_date_time.epoch }}
+            mode: "0644"
+          failed_when: false
+
+        - name: Re-switch HAProxy back to the prior color
+          ansible.builtin.import_role:
+            name: veza_haproxy_switch
+          vars:
+            veza_active_color: "{{ prior_active_color }}"
+          delegate_to: "{{ veza_container_prefix + 'haproxy' }}"
+
+        - name: Fail the playbook
+          ansible.builtin.fail:
+            msg: >-
+              Public health probe via HAProxy failed after deploy of SHA
+              {{ veza_release_sha[:12] }} to color {{ inactive_color }}.
+              HAProxy reverted to the prior color ({{ prior_active_color }}).
+              The freshly-deployed {{ inactive_color }} containers are kept
+              alive for forensics — inspect with:
+              incus exec {{ veza_container_prefix }}backend-{{ inactive_color }} -- journalctl -u veza-backend -n 200
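+
+# Contract assumed from roles/veza_haproxy_switch (landed in an earlier
+# commit; sketched here, not authoritative): it consumes veza_active_color,
+# renders + validates the new haproxy.cfg, swaps it in atomically, HUPs
+# HAProxy, and its rescue path restores the previous cfg. Phase B also
+# assumes the role records the new color in /var/lib/veza/active-color —
+# the read side lives here, the write side is expected in the role.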