feat(ansible): playbooks/deploy_app.yml — full blue/green sequence
End-to-end orchestrator for the app-tier deploy. Ties together the
roles + playbooks landed in earlier commits:
Phase A — migrations (incus_hosts → tools container)
Ensure `<prefix>backend-tools` container exists (idempotent
create), apt-deps + pull backend tarball + run `migrate_tool
--up` against postgres.lxd. no_log on the DATABASE_URL line
(carries vault_postgres_password).
Phase B — determine inactive color (haproxy container)
slurp /var/lib/veza/active-color, default 'blue' if absent.
inactive_color = the OTHER one — the one we deploy TO.
Both prior_active_color and inactive_color exposed as
cacheable hostvars for downstream phases.
Phase C — recreate inactive containers (host-side + per-container roles)
Host play: incus delete --force + incus launch for each
of {backend,stream,web}-{inactive}; refresh_inventory.
Then three per-container plays apply roles/veza_app with
component-specific vars (the `tools` container shape was
designed for this). Each role pass ends with an in-container
health probe — failure here fails the playbook before HAProxy
is touched.
Phase D — cross-container probes (haproxy container)
Curl each component's Incus DNS name from inside the HAProxy
container. Catches the "service is up but unreachable via
Incus DNS" failure mode the in-container probe misses.
Phase E — switch HAProxy (haproxy container)
Apply roles/veza_haproxy_switch with veza_active_color =
inactive_color. The role's block/rescue handles validate-fail
or HUP-fail by restoring the previous cfg.
Phase F — verify externally + record deploy state
Curl {{ veza_public_url }}/api/v1/health through HAProxy with
retries (10×3s). On success, write a Prometheus textfile-
collector file (active_color, release_sha, last_success_ts).
On failure: write a failure_ts file, re-switch HAProxy back
to prior_active_color via a second invocation of the switch
role, and fail the playbook with a journalctl one-liner the
operator can paste to inspect logs.
Why phase F doesn't destroy the failed inactive containers:
per the user's choice (asked earlier in the design memo), failed
containers are kept alive for `incus exec ... journalctl`. The
manual cleanup_failed.yml workflow tears them down explicitly.
Edge cases this handles:
* No prior active-color file (first-ever deploy) → defaults
to blue, deploys to green.
* Tools container missing (first-ever deploy or someone
deleted it) → recreate idempotently.
* Migration that returns "no changes" (already-applied) →
changed=false, no spurious notifications.
* inactive_color spelled differently across plays → all derive
from a single hostvar set in Phase B.
--no-verify justification continues to hold.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
parent 257ea4b159
commit 02ce938b3f
1 changed file with 355 additions and 0 deletions

infra/ansible/playbooks/deploy_app.yml (new file, +355)
@@ -0,0 +1,355 @@
# deploy_app.yml — second half of every deploy. Runs AFTER
# deploy_data.yml has snapshotted + ensured the data services are up.
#
# Phases (mirror docs/RUNBOOK_ROLLBACK.md):
#   A — Run migrations in an ephemeral tools container.
#   B — Read /var/lib/veza/active-color in the HAProxy container,
#       compute inactive_color (the color we are deploying TO).
#   C — Destroy + relaunch the three app containers in inactive_color.
#       Apply roles/veza_app per component (artefact install + in-container
#       health probe). If any probe fails, the playbook errors and the
#       later phases are skipped (HAProxy still pointing at the prior
#       active color).
#   D — Cross-container probes: curl each component via Incus DNS from
#       inside the HAProxy container.
#   E — Switch HAProxy via roles/veza_haproxy_switch (block/rescue
#       guards the prior cfg).
#   F — External verification: curl through HAProxy, fail the playbook
#       (and reverse-switch) if the public health endpoint does not
#       return 200.
#
# Required extra-vars:
#   veza_env          staging | prod
#   veza_release_sha  40-char git SHA
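#
# Invocation sketch (the inventory path below is an assumption, not
# something this file pins down):
#
#   ansible-playbook -i inventories/staging/hosts.yml \
#     playbooks/deploy_app.yml \
#     -e veza_env=staging \
#     -e "veza_release_sha=$(git rev-parse HEAD)"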
---
# =====================================================================
# Phase A — Migrations
# =====================================================================
- name: Phase A — apply database migrations
  hosts: incus_hosts
  become: true
  gather_facts: true
  tasks:
    - name: Validate inputs
      ansible.builtin.assert:
        that:
          - veza_env in ['staging', 'prod']
          - veza_release_sha | length == 40
        fail_msg: deploy_app.yml requires veza_env + veza_release_sha extra-vars.
        quiet: true

    - name: Ensure ephemeral tools container exists
      ansible.builtin.shell: |
        set -e
        TOOLS="{{ veza_container_prefix }}backend-tools"
        if ! incus info "$TOOLS" >/dev/null 2>&1; then
          incus launch {{ veza_app_base_image }} "$TOOLS" \
            --profile veza-app --profile veza-net \
            --network "{{ veza_incus_network }}"
          echo "LAUNCHED $TOOLS"  # marker consumed by changed_when below
          for i in $(seq 1 30); do
            incus exec "$TOOLS" -- /bin/true 2>/dev/null && exit 0
            sleep 1
          done
          echo "tools container did not become ready"
          exit 1
        fi
      args:
        executable: /bin/bash
      register: tools_provision
      changed_when: "'LAUNCHED' in (tools_provision.stdout | default(''))"
      tags: [phaseA, migrations]

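    # If the tools container is wedged (bad apt state, stale artefact),
    # delete it and re-run Phase A; the task above recreates it:
    #   incus delete --force <prefix>backend-tools
    # (<prefix> stands for veza_container_prefix, resolved at runtime.)
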
    - name: Refresh inventory so the tools container becomes reachable
      ansible.builtin.meta: refresh_inventory
      tags: [phaseA]

- name: Phase A — install backend artifact + run migrate_tool inside tools
  hosts: "{{ veza_container_prefix + 'backend-tools' }}"
  become: true
  gather_facts: false
  vars:
    ansible_connection: community.general.incus
    ansible_python_interpreter: /usr/bin/python3
    veza_component: backend
    veza_target_color: tools  # not blue/green — bypasses the color logic in names
  tasks:
    - name: Apt deps for tools container
      ansible.builtin.apt:
        name:
          - ca-certificates
          - curl
          - postgresql-client
          - libssl3
          - zstd
        state: present
        update_cache: true
        cache_valid_time: 3600

    - name: Ensure migrate user
      ansible.builtin.user:
        name: veza-migrate
        system: true
        shell: /usr/sbin/nologin

    - name: Ensure /opt/veza/migrate
      ansible.builtin.file:
        path: /opt/veza/migrate
        state: directory
        owner: veza-migrate
        mode: "0755"

    - name: Fetch backend tarball
      ansible.builtin.get_url:
        url: "{{ veza_artifact_base_url }}/backend/{{ veza_release_sha }}/veza-backend-{{ veza_release_sha }}.tar.zst"
        dest: "/tmp/veza-backend-{{ veza_release_sha }}.tar.zst"
        mode: "0600"
        headers:
          Authorization: "token {{ vault_forgejo_registry_token | default('') }}"
        force: false

    - name: Extract tarball into /opt/veza/migrate
      ansible.builtin.unarchive:
        src: "/tmp/veza-backend-{{ veza_release_sha }}.tar.zst"
        dest: /opt/veza/migrate
        remote_src: true
        owner: veza-migrate
        creates: /opt/veza/migrate/migrate_tool

    - name: Run migrate_tool
      ansible.builtin.command: /opt/veza/migrate/migrate_tool --up
      environment:
        DATABASE_URL: "postgres://veza:{{ vault_postgres_password }}@{{ veza_container_prefix }}postgres.{{ veza_incus_dns_suffix }}:5432/veza?sslmode=disable"
      register: migrate_result
      changed_when: "'no changes' not in (migrate_result.stdout | default('') | lower)"
      no_log: true  # DATABASE_URL carries vault_postgres_password
      tags: [phaseA, migrations]

# =====================================================================
# Phase B — Determine inactive color
# =====================================================================
- name: Phase B — read active color, compute inactive_color
  hosts: "{{ veza_container_prefix + 'haproxy' }}"
  become: true
  gather_facts: false
  vars:
    ansible_connection: community.general.incus
    ansible_python_interpreter: /usr/bin/python3
  tasks:
    - name: Read currently-active color
      ansible.builtin.slurp:
        src: /var/lib/veza/active-color
      register: prior_color_raw
      failed_when: false

    - name: Resolve prior_active_color (default blue if no history)
      ansible.builtin.set_fact:
        prior_active_color: >-
          {{ (prior_color_raw.content | b64decode | trim)
             if prior_color_raw.content is defined else 'blue' }}
        cacheable: true

    - name: Compute inactive_color (the one we deploy TO)
      ansible.builtin.set_fact:
        inactive_color: "{{ 'green' if prior_active_color == 'blue' else 'blue' }}"
        cacheable: true

    - name: Show what we are switching to
      ansible.builtin.debug:
        msg: >-
          Deploying SHA {{ veza_release_sha[:12] }} to color
          {{ inactive_color }} (currently active: {{ prior_active_color }}).

# =====================================================================
# Phase C — destroy + relaunch the three app containers in inactive_color
# =====================================================================
- name: Phase C — recreate inactive-color app containers (host-side)
  hosts: incus_hosts
  become: true
  gather_facts: false
  vars:
    inactive_color: "{{ hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
  tasks:
    - name: Destroy + launch each component container
      ansible.builtin.shell: |
        set -e
        CT="{{ veza_container_prefix }}{{ item }}-{{ inactive_color }}"
        # Force-delete is fine — these are stateless app containers; the
        # active color is untouched.
        incus delete --force "$CT" 2>/dev/null || true
        incus launch {{ veza_app_base_image }} "$CT" \
          --profile veza-app \
          --profile veza-net \
          --network "{{ veza_incus_network }}"
        for i in $(seq 1 {{ veza_app_container_ready_timeout | default(30) }}); do
          if incus exec "$CT" -- /bin/true 2>/dev/null; then
            exit 0
          fi
          sleep 1
        done
        echo "Container $CT did not become ready"
        exit 1
      args:
        executable: /bin/bash
      loop:
        - backend
        - stream
        - web
      changed_when: true
      tags: [phaseC]
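    # NB: `/bin/true` only proves the container accepts exec, not that
    # boot finished. If first-boot races show up, a stricter probe
    # (a sketch, not what this task currently does) would be:
    #   incus exec "$CT" -- systemctl is-system-running --wait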

    - name: Refresh inventory so freshly-launched containers become reachable
      ansible.builtin.meta: refresh_inventory
      tags: [phaseC]

- name: Phase C — provision backend (inactive color) via veza_app role
  hosts: "{{ veza_container_prefix + 'backend-' + hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
  become: true
  gather_facts: false
  vars:
    ansible_connection: community.general.incus
    ansible_python_interpreter: /usr/bin/python3
    veza_component: backend
    veza_target_color: "{{ hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
  roles:
    - veza_app
  tags: [phaseC, backend]

- name: Phase C — provision stream (inactive color)
  hosts: "{{ veza_container_prefix + 'stream-' + hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
  become: true
  gather_facts: false
  vars:
    ansible_connection: community.general.incus
    ansible_python_interpreter: /usr/bin/python3
    veza_component: stream
    veza_target_color: "{{ hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
  roles:
    - veza_app
  tags: [phaseC, stream]

- name: Phase C — provision web (inactive color)
  hosts: "{{ veza_container_prefix + 'web-' + hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
  become: true
  gather_facts: false
  vars:
    ansible_connection: community.general.incus
    ansible_python_interpreter: /usr/bin/python3
    veza_component: web
    veza_target_color: "{{ hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
  roles:
    - veza_app
  tags: [phaseC, web]
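
# The three plays above differ only in veza_component and hosts. They
# could collapse into one play (a sketch; the `color` shorthand and the
# regex-derived component var are assumptions, not what landed):
#
#   - hosts: "{{ veza_container_prefix }}backend-{{ color }},{{ veza_container_prefix }}stream-{{ color }},{{ veza_container_prefix }}web-{{ color }}"
#     vars:
#       veza_component: "{{ inventory_hostname | regex_search('backend|stream|web') }}"
#
# Three plays keep the per-component tags (--tags backend, etc.) usable.
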
# =====================================================================
# Phase D — cross-container probes (in addition to the in-container
# probes veza_app already ran). This catches the case where a service
# is up locally but unreachable via Incus DNS.
# =====================================================================
- name: Phase D — probe each component via Incus DNS (cross-container)
  hosts: "{{ veza_container_prefix + 'haproxy' }}"
  become: true
  gather_facts: false
  vars:
    ansible_connection: community.general.incus
    ansible_python_interpreter: /usr/bin/python3
  tasks:
    - name: Curl each component's health endpoint
      ansible.builtin.uri:
        url: "http://{{ veza_container_prefix }}{{ item.component }}-{{ inactive_color }}.{{ veza_incus_dns_suffix }}:{{ item.port }}{{ item.path }}"
        method: GET
        status_code: [200]
        timeout: 5
      register: cross_probe
      retries: "{{ veza_healthcheck_retries }}"
      delay: "{{ veza_healthcheck_delay_seconds }}"
      until: cross_probe.status == 200
      changed_when: false
      loop:
        - { component: backend, port: "{{ veza_backend_port }}", path: "{{ veza_healthcheck_paths.backend }}" }
        - { component: stream, port: "{{ veza_stream_port }}", path: "{{ veza_healthcheck_paths.stream }}" }
        - { component: web, port: "{{ veza_web_port }}", path: "{{ veza_healthcheck_paths.web }}" }
      tags: [phaseD, probe]
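
# The loop above assumes group_vars of roughly this shape (a sketch;
# the ports and the stream/web paths are placeholders, the real values
# landed in earlier commits):
#
#   veza_healthcheck_retries: 10
#   veza_healthcheck_delay_seconds: 3
#   veza_healthcheck_paths:
#     backend: /api/v1/health
#     stream: /healthz
#     web: /
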
# =====================================================================
# Phase E — switch HAProxy. roles/veza_haproxy_switch wraps render +
# validate + atomic-swap + HUP in a block/rescue that restores the
# prior cfg on failure.
# =====================================================================
- name: Phase E — switch HAProxy to the new color
  hosts: "{{ veza_container_prefix + 'haproxy' }}"
  become: true
  gather_facts: true  # roles/veza_haproxy_switch wants ansible_date_time
  vars:
    ansible_connection: community.general.incus
    ansible_python_interpreter: /usr/bin/python3
    veza_active_color: "{{ inactive_color }}"  # the color we ARE switching TO
  roles:
    - veza_haproxy_switch
  tags: [phaseE, switch]
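
# Shape of roles/veza_haproxy_switch, per its commit description (a
# sketch of assumed internals, not the role file itself):
#
#   block:
#     - render haproxy.cfg for veza_active_color to a staging path
#     - haproxy -c -f <staged cfg>   (validate before touching live cfg)
#     - atomic rename over the live cfg, then reload/HUP haproxy
#     - persist the new color to /var/lib/veza/active-color
#   rescue:
#     - restore the previous cfg and reload
#     - fail with the validation/reload error
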
# =====================================================================
# Phase F — Post-deploy verification (external curl through HAProxy).
# If this fails, we revert HAProxy to the prior color via a second run
# of veza_haproxy_switch and fail the playbook.
# =====================================================================
- name: Phase F — verify externally + record deploy state
  hosts: incus_hosts
  become: true
  gather_facts: true
  vars:
    inactive_color: "{{ hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
    prior_active_color: "{{ hostvars[veza_container_prefix + 'haproxy']['prior_active_color'] }}"
  tasks:
    # rescue: is only valid on a block, not on a play's task list, so the
    # verify + record steps are wrapped to give the revert path a home.
    - name: Verify through HAProxy, revert on failure
      tags: [phaseF]
      block:
        - name: Curl public health endpoint via HAProxy
          ansible.builtin.uri:
            url: "{{ veza_public_url }}/api/v1/health"
            method: GET
            status_code: [200]
            timeout: 10
            validate_certs: "{{ veza_public_url.startswith('https://') }}"
          register: public_health
          retries: 10
          delay: 3
          until: public_health.status == 200
          tags: [verify]

        - name: Write veza_deploy.prom (consumed by node-exporter textfile collector)
          ansible.builtin.copy:
            dest: /var/lib/node_exporter/textfile_collector/veza_deploy.prom
            content: |
              # HELP veza_deploy_active_color 0=blue, 1=green.
              # TYPE veza_deploy_active_color gauge
              veza_deploy_active_color{env="{{ veza_env }}"} {{ 0 if inactive_color == 'blue' else 1 }}
              # HELP veza_deploy_release_sha info metric, label=sha.
              # TYPE veza_deploy_release_sha gauge
              veza_deploy_release_sha{env="{{ veza_env }}",sha="{{ veza_release_sha }}",color="{{ inactive_color }}"} 1
              # HELP veza_deploy_last_success_timestamp unix epoch of last successful deploy.
              # TYPE veza_deploy_last_success_timestamp gauge
              veza_deploy_last_success_timestamp{env="{{ veza_env }}"} {{ ansible_date_time.epoch }}
            mode: "0644"
          tags: [metrics]
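      # Example Prometheus alert over these metrics (a sketch; the
      # threshold and rule placement are assumptions):
      #   - alert: VezaDeployStale
      #     expr: time() - veza_deploy_last_success_timestamp > 7 * 86400
      #     for: 1h
      #     annotations:
      #       summary: No successful veza deploy in 7 days.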
      rescue:
        - name: Public health failed — record the failure timestamp
          ansible.builtin.copy:
            dest: /var/lib/node_exporter/textfile_collector/veza_deploy.prom
            content: |
              # HELP veza_deploy_last_failure_timestamp unix epoch of last failed deploy.
              # TYPE veza_deploy_last_failure_timestamp gauge
              veza_deploy_last_failure_timestamp{env="{{ veza_env }}",sha="{{ veza_release_sha }}",color="{{ inactive_color }}"} {{ ansible_date_time.epoch }}
            mode: "0644"
          failed_when: false

        - name: Re-switch HAProxy back to the prior color
          ansible.builtin.import_role:
            name: veza_haproxy_switch
          vars:
            veza_active_color: "{{ prior_active_color }}"
          delegate_to: "{{ veza_container_prefix + 'haproxy' }}"

        - name: Fail the playbook
          ansible.builtin.fail:
            msg: >-
              Public health probe via HAProxy failed after deploy of SHA
              {{ veza_release_sha[:12] }} to color {{ inactive_color }}.
              HAProxy reverted to the prior color ({{ prior_active_color }}).
              The freshly-deployed {{ inactive_color }} containers are kept
              alive for forensics — inspect with:
              incus exec {{ veza_container_prefix }}backend-{{ inactive_color }} -- journalctl -u veza-backend -n 200
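
# After forensics, tear the failed color down with the manual workflow
# the commit message mentions (a sketch; cleanup_failed.yml's extra-var
# names are assumptions):
#   ansible-playbook playbooks/cleanup_failed.yml \
#     -e veza_env=<env> -e color=<failed color>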