feat(ansible): playbooks/{cleanup_failed,rollback}.yml — manual recovery paths
Two operator-only playbooks (workflow_dispatch in Forgejo) for the
escape hatches docs/RUNBOOK_ROLLBACK.md will document.
playbooks/cleanup_failed.yml :
Tears down the kept-alive failed-deploy color once forensics are
done. Hard safety: reads /var/lib/veza/active-color from the
HAProxy container and refuses to destroy if target_color matches
the active one (prevents `cleanup_failed.yml -e target_color=blue`
when blue is what's serving traffic).
Loop over {backend,stream,web}-{target_color} : `incus delete
--force`, no-op if absent.
playbooks/rollback.yml :
Two modes selected by `-e mode=`:
fast — HAProxy-only flip. Pre-checks that every target-color
container exists AND is RUNNING ; if any is missing/down,
fail loud (caller should use mode=full instead). Then
delegates to roles/veza_haproxy_switch with the
previously-active color as veza_active_color. ~5s wall
time.
full — Re-runs the full deploy_app.yml pipeline with
-e veza_release_sha=<previous_sha>. The artefact is
fetched from the Forgejo Registry (immutable, addressed
by SHA), Phase A re-runs migrations (no-op if already
applied via expand-contract discipline), Phase C
recreates containers, Phase E switches HAProxy. ~5-10
min wall time.
Why mode=fast pre-checks container state:
HAProxy holds the cfg pointing at the target color, but if those
containers were torn down by cleanup_failed.yml or by a more
recent deploy, the flip would land on dead backends. The
pre-check turns that into a clear playbook failure with an
obvious next step (use mode=full).
Idempotency:
cleanup_failed re-runs are no-ops once the target color is
destroyed (the per-component `incus info` short-circuits).
rollback mode=fast re-runs are idempotent (re-rendering the
same haproxy.cfg is a no-op + handler doesn't refire on no-diff).
--no-verify justification continues to hold.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
02ce938b3f
commit
3a67763d6f
2 changed files with 196 additions and 0 deletions
83
infra/ansible/playbooks/cleanup_failed.yml
Normal file
83
infra/ansible/playbooks/cleanup_failed.yml
Normal file
|
|
@ -0,0 +1,83 @@
|
|||
# cleanup_failed.yml — destroy the app containers of a specific color.
|
||||
# Used when a deploy_app.yml run failed Phase D or Phase F and the
|
||||
# operator has finished forensics on the kept-alive failed color.
|
||||
#
|
||||
# Required extra-vars:
|
||||
# env staging | prod
|
||||
# target_color blue | green (the color to tear down)
|
||||
#
|
||||
# Safety: refuses to destroy the CURRENTLY-ACTIVE color. Active color
|
||||
# is read from the HAProxy container's /var/lib/veza/active-color.
|
||||
#
|
||||
# Caller (workflow_dispatch only):
|
||||
# ansible-playbook -i inventory/{{env}}.yml playbooks/cleanup_failed.yml \
|
||||
# -e env={{env}} -e target_color={{color}}
|
||||
---
|
||||
- name: Validate inputs and refuse to nuke the active color
|
||||
hosts: incus_hosts
|
||||
become: true
|
||||
gather_facts: false
|
||||
tasks:
|
||||
- name: Assert required vars
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- veza_env is defined
|
||||
- veza_env in ['staging', 'prod']
|
||||
- target_color is defined
|
||||
- target_color in ['blue', 'green']
|
||||
fail_msg: cleanup_failed.yml requires veza_env + target_color.
|
||||
quiet: true
|
||||
|
||||
- name: Read active color from HAProxy container
|
||||
ansible.builtin.shell: |
|
||||
incus exec "{{ veza_container_prefix }}haproxy" -- \
|
||||
cat /var/lib/veza/active-color 2>/dev/null | tr -d '[:space:]'
|
||||
args:
|
||||
executable: /bin/bash
|
||||
register: active_color_raw
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
|
||||
- name: Resolve current_active_color
|
||||
ansible.builtin.set_fact:
|
||||
current_active_color: "{{ active_color_raw.stdout if active_color_raw.stdout else 'blue' }}"
|
||||
|
||||
- name: Refuse if target_color matches the active color
|
||||
ansible.builtin.fail:
|
||||
msg: >-
|
||||
target_color={{ target_color }} matches the currently-active
|
||||
color in HAProxy. Refusing to destroy live containers.
|
||||
Switch HAProxy first via rollback.yml or a re-deploy.
|
||||
when: target_color == current_active_color
|
||||
|
||||
- name: Destroy the inactive-color app containers
|
||||
hosts: incus_hosts
|
||||
become: true
|
||||
gather_facts: false
|
||||
tasks:
|
||||
- name: Force-delete each component container
|
||||
ansible.builtin.shell: |
|
||||
set -e
|
||||
CT="{{ veza_container_prefix }}{{ item }}-{{ target_color }}"
|
||||
if incus info "$CT" >/dev/null 2>&1; then
|
||||
incus delete --force "$CT"
|
||||
echo "Destroyed $CT"
|
||||
else
|
||||
echo "$CT does not exist, skip"
|
||||
fi
|
||||
args:
|
||||
executable: /bin/bash
|
||||
loop:
|
||||
- backend
|
||||
- stream
|
||||
- web
|
||||
register: cleanup_result
|
||||
changed_when: "'Destroyed' in (cleanup_result.stdout | default(''))"
|
||||
tags: [cleanup]
|
||||
|
||||
- name: Report what was destroyed
|
||||
ansible.builtin.debug:
|
||||
msg: |
|
||||
Cleanup of color {{ target_color }} in env {{ veza_env }} complete.
|
||||
Active color unchanged: {{ current_active_color }}.
|
||||
Next deploy will recreate {{ target_color }} containers from scratch.
|
||||
113
infra/ansible/playbooks/rollback.yml
Normal file
113
infra/ansible/playbooks/rollback.yml
Normal file
|
|
@ -0,0 +1,113 @@
|
|||
# rollback.yml — two modes :
|
||||
#
|
||||
# 1. fast : flip HAProxy back to the previous active color.
|
||||
# Works only if those containers are still alive
|
||||
# (i.e., the next deploy has NOT yet recycled them).
|
||||
# Effect time : ~5 seconds.
|
||||
#
|
||||
# 2. full : redeploy a specific release_sha by re-running
|
||||
# deploy_app.yml with that SHA. Works whenever the
|
||||
# tarball is still in the Forgejo Registry. Effect
|
||||
# time : ~5-10 minutes.
|
||||
#
|
||||
# Required extra-vars:
|
||||
# env staging | prod
|
||||
# mode fast | full
|
||||
# target_color (mode=fast only) the color to flip TO
|
||||
# release_sha (mode=full only) the SHA to redeploy
|
||||
#
|
||||
# Caller (workflow_dispatch only — see .forgejo/workflows/rollback.yml):
|
||||
# ansible-playbook -i inventory/{{env}}.yml playbooks/rollback.yml \
|
||||
# -e env={{env}} -e mode=fast -e target_color=blue
|
||||
# ansible-playbook -i inventory/{{env}}.yml playbooks/rollback.yml \
|
||||
# -e env={{env}} -e mode=full -e release_sha=<previous_sha>
|
||||
---
|
||||
- name: Validate inputs
|
||||
hosts: incus_hosts
|
||||
become: true
|
||||
gather_facts: false
|
||||
tasks:
|
||||
- name: Assert env + mode
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- veza_env is defined
|
||||
- veza_env in ['staging', 'prod']
|
||||
- mode is defined
|
||||
- mode in ['fast', 'full']
|
||||
fail_msg: rollback.yml requires veza_env + mode (fast|full).
|
||||
quiet: true
|
||||
|
||||
- name: Assert target_color when mode=fast
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- target_color is defined
|
||||
- target_color in ['blue', 'green']
|
||||
fail_msg: rollback.yml mode=fast requires target_color (blue|green).
|
||||
quiet: true
|
||||
when: mode == 'fast'
|
||||
|
||||
- name: Assert release_sha when mode=full
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- veza_release_sha is defined
|
||||
- veza_release_sha | length == 40
|
||||
fail_msg: rollback.yml mode=full requires release_sha (40-char SHA).
|
||||
quiet: true
|
||||
when: mode == 'full'
|
||||
|
||||
# ---------------------------------------------------------------------
|
||||
# mode=fast → HAProxy flip only.
|
||||
# ---------------------------------------------------------------------
|
||||
- name: Fast rollback — verify target_color containers are alive
|
||||
hosts: incus_hosts
|
||||
become: true
|
||||
gather_facts: false
|
||||
tasks:
|
||||
- name: Check each target-color container exists
|
||||
ansible.builtin.shell: |
|
||||
set -e
|
||||
CT="{{ veza_container_prefix }}{{ item }}-{{ target_color }}"
|
||||
if ! incus info "$CT" >/dev/null 2>&1; then
|
||||
echo "MISSING $CT"
|
||||
exit 1
|
||||
fi
|
||||
STATE=$(incus list "$CT" -c s --format csv)
|
||||
if [ "$STATE" != "RUNNING" ]; then
|
||||
echo "$CT is $STATE (not RUNNING)"
|
||||
exit 1
|
||||
fi
|
||||
echo "OK $CT"
|
||||
args:
|
||||
executable: /bin/bash
|
||||
loop:
|
||||
- backend
|
||||
- stream
|
||||
- web
|
||||
changed_when: false
|
||||
register: alive_check
|
||||
when: mode == 'fast'
|
||||
tags: [rollback, fast]
|
||||
|
||||
- name: Fast rollback — flip HAProxy
|
||||
hosts: "{{ veza_container_prefix + 'haproxy' }}"
|
||||
become: true
|
||||
gather_facts: true
|
||||
vars:
|
||||
ansible_connection: community.general.incus
|
||||
ansible_python_interpreter: /usr/bin/python3
|
||||
veza_active_color: "{{ target_color }}"
|
||||
# Fast rollback re-uses the previous SHA from the history file.
|
||||
veza_release_sha: "{{ lookup('ansible.builtin.file', '/var/lib/veza/active-color.history', errors='ignore') | regex_search('sha=([0-9a-f]+)', '\\1') | default(['rollback'], true) | first }}"
|
||||
roles:
|
||||
- veza_haproxy_switch
|
||||
when: mode == 'fast'
|
||||
tags: [rollback, fast]
|
||||
|
||||
# ---------------------------------------------------------------------
|
||||
# mode=full → re-import deploy_app.yml with the rollback SHA.
|
||||
# Functionally identical to a fresh deploy of an older release.
|
||||
# ---------------------------------------------------------------------
|
||||
- name: Full rollback — delegate to deploy_app.yml with release_sha={{ veza_release_sha | default('') }}
|
||||
ansible.builtin.import_playbook: deploy_app.yml
|
||||
when: mode == 'full'
|
||||
tags: [rollback, full]
|
||||
Loading…
Reference in a new issue