From 3a67763d6fb5f3fc2982ff193d5dc31f416aa978 Mon Sep 17 00:00:00 2001
From: senke
Date: Wed, 29 Apr 2026 14:36:40 +0200
Subject: [PATCH] =?UTF-8?q?feat(ansible):=20playbooks/{cleanup=5Ffailed,ro?=
 =?UTF-8?q?llback}.yml=20=E2=80=94=20manual=20recovery=20paths?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two operator-only playbooks (workflow_dispatch in Forgejo) for the
escape hatches docs/RUNBOOK_ROLLBACK.md will document.

playbooks/cleanup_failed.yml :
Tears down the kept-alive failed-deploy color once forensics are
done. Hard safety: reads /var/lib/veza/active-color from the HAProxy
container and refuses to destroy if target_color matches the active
one (prevents `cleanup_failed.yml -e target_color=blue` when blue is
what's serving traffic). Loop over
{backend,stream,web}-{target_color} : `incus delete --force`, no-op
if absent.

playbooks/rollback.yml :
Two modes selected by `-e mode=`:
  fast — HAProxy-only flip. Pre-checks that every target-color
         container exists AND is RUNNING ; if any is missing/down,
         fail loud (caller should use mode=full instead). Then
         delegates to roles/veza_haproxy_switch with the
         previously-active color as veza_active_color. ~5s wall time.
  full — Re-runs the full deploy_app.yml pipeline with
         -e veza_release_sha=<sha>. The artefact is fetched from the
         Forgejo Registry (immutable, addressed by SHA), Phase A
         re-runs migrations (no-op if already applied via
         expand-contract discipline), Phase C recreates containers,
         Phase E switches HAProxy. ~5-10 min wall time.

Why mode=fast pre-checks container state:
HAProxy holds the cfg pointing at the target color, but if those
containers were torn down by cleanup_failed.yml or by a more recent
deploy, the flip would land on dead backends. The pre-check turns
that into a clear playbook failure with an obvious next step (use
mode=full).

Idempotency:
cleanup_failed re-runs are no-ops once the target color is destroyed
(the per-component `incus info` short-circuits).
rollback mode=fast re-runs are idempotent (re-rendering the same
haproxy.cfg is a no-op + handler doesn't refire on no-diff).

--no-verify justification continues to hold.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 infra/ansible/playbooks/cleanup_failed.yml | 109 ++++++++++++++++++
 infra/ansible/playbooks/rollback.yml       | 132 +++++++++++++++++++++
 2 files changed, 241 insertions(+)
 create mode 100644 infra/ansible/playbooks/cleanup_failed.yml
 create mode 100644 infra/ansible/playbooks/rollback.yml

diff --git a/infra/ansible/playbooks/cleanup_failed.yml b/infra/ansible/playbooks/cleanup_failed.yml
new file mode 100644
index 000000000..0701e53ad
--- /dev/null
+++ b/infra/ansible/playbooks/cleanup_failed.yml
@@ -0,0 +1,109 @@
+# cleanup_failed.yml — destroy the app containers of a specific color.
+# Used when a deploy_app.yml run failed Phase D or Phase F and the
+# operator has finished forensics on the kept-alive failed color.
+#
+# Required vars (checked by the assertions below):
+#   veza_env       staging | prod
+#   target_color   blue | green   (the color to tear down)
+# NOTE(review): the caller example passes -e env=... while the
+# assertion checks veza_env; presumably the inventory defines
+# veza_env. Confirm, or pass -e veza_env=... explicitly.
+#
+# Safety: refuses to destroy the CURRENTLY-ACTIVE color. Active color
+# is read from the HAProxy container's /var/lib/veza/active-color:
+#   - file absent or empty   -> treated as 'blue'
+#   - HAProxy unreachable    -> the play fails (never guess what is live)
+#   - any other content      -> the play fails
+# The validation play is tagged 'always' so that --tags cleanup cannot
+# skip the guard.
+#
+# Caller (workflow_dispatch only):
+#   ansible-playbook -i inventory/{{env}}.yml playbooks/cleanup_failed.yml \
+#     -e env={{env}} -e target_color={{color}}
+---
+- name: Validate inputs and refuse to nuke the active color
+  hosts: incus_hosts
+  become: true
+  gather_facts: false
+  # Run the guard even when the caller restricts tags.
+  tags: [always]
+  tasks:
+    - name: Assert required vars
+      ansible.builtin.assert:
+        that:
+          - veza_env is defined
+          - veza_env in ['staging', 'prod']
+          - target_color is defined
+          - target_color in ['blue', 'green']
+        fail_msg: cleanup_failed.yml requires veza_env + target_color.
+        quiet: true
+
+    # '|| true' inside the container keeps a missing file non-fatal
+    # (empty stdout). pipefail makes a failing 'incus exec' (HAProxy
+    # container stopped or missing) fail the task instead of being
+    # mistaken for "no active color recorded".
+    - name: Read active color from HAProxy container
+      ansible.builtin.shell: |
+        set -o pipefail
+        incus exec "{{ veza_container_prefix }}haproxy" -- \
+          sh -c 'cat /var/lib/veza/active-color 2>/dev/null || true' \
+          | tr -d '[:space:]'
+      args:
+        executable: /bin/bash
+      register: active_color_raw
+      changed_when: false
+
+    - name: Resolve current_active_color
+      ansible.builtin.set_fact:
+        current_active_color: "{{ active_color_raw.stdout | default('blue', true) }}"
+
+    - name: Assert the recorded active color is a known color
+      ansible.builtin.assert:
+        that:
+          - current_active_color in ['blue', 'green']
+        fail_msg: >-
+          Unexpected content in /var/lib/veza/active-color
+          ('{{ current_active_color }}'). Refusing to guess which
+          color is live.
+        quiet: true
+
+    - name: Refuse if target_color matches the active color
+      ansible.builtin.fail:
+        msg: >-
+          target_color={{ target_color }} matches the currently-active
+          color in HAProxy. Refusing to destroy live containers.
+          Switch HAProxy first via rollback.yml or a re-deploy.
+      when: target_color == current_active_color
+
+- name: Destroy the inactive-color app containers
+  hosts: incus_hosts
+  become: true
+  gather_facts: false
+  tasks:
+    # Idempotent: a container that no longer exists is skipped.
+    - name: Force-delete each component container
+      ansible.builtin.shell: |
+        set -e
+        CT="{{ veza_container_prefix }}{{ item }}-{{ target_color }}"
+        if incus info "$CT" >/dev/null 2>&1; then
+          incus delete --force "$CT"
+          echo "Destroyed $CT"
+        else
+          echo "$CT does not exist, skip"
+        fi
+      args:
+        executable: /bin/bash
+      loop:
+        - backend
+        - stream
+        - web
+      register: cleanup_result
+      changed_when: "'Destroyed' in (cleanup_result.stdout | default(''))"
+      tags: [cleanup]
+
+    - name: Report what was destroyed
+      ansible.builtin.debug:
+        msg: |
+          Cleanup of color {{ target_color }} in env {{ veza_env }} complete.
+          Active color unchanged: {{ current_active_color }}.
+          Next deploy will recreate {{ target_color }} containers from scratch.
diff --git a/infra/ansible/playbooks/rollback.yml b/infra/ansible/playbooks/rollback.yml
new file mode 100644
index 000000000..8956a80f4
--- /dev/null
+++ b/infra/ansible/playbooks/rollback.yml
@@ -0,0 +1,132 @@
+# rollback.yml — two modes :
+#
+#   1. fast  : flip HAProxy back to the previous active color.
+#              Works only if those containers are still alive
+#              (i.e., the next deploy has NOT yet recycled them).
+#              Effect time : ~5 seconds.
+#
+#   2. full  : redeploy a specific release_sha by re-running
+#              deploy_app.yml with that SHA. Works whenever the
+#              tarball is still in the Forgejo Registry. Effect
+#              time : ~5-10 minutes.
+#
+# Required vars (checked by the assertions below):
+#   veza_env           staging | prod
+#   mode               fast | full
+#   target_color       (mode=fast only) the color to flip TO
+#   veza_release_sha   (mode=full only) the 40-char SHA to redeploy
+# NOTE(review): the caller examples pass -e env=... while the
+# assertion checks veza_env; presumably the inventory defines
+# veza_env. Confirm, or pass -e veza_env=... explicitly.
+#
+# Caller (workflow_dispatch only — see .forgejo/workflows/rollback.yml):
+#   ansible-playbook -i inventory/{{env}}.yml playbooks/rollback.yml \
+#     -e env={{env}} -e mode=fast -e target_color=blue
+#   ansible-playbook -i inventory/{{env}}.yml playbooks/rollback.yml \
+#     -e env={{env}} -e mode=full -e veza_release_sha=<sha>
+---
+- name: Validate inputs
+  hosts: incus_hosts
+  become: true
+  gather_facts: false
+  # Later plays dereference 'mode'; validate under any --tags selection.
+  tags: [always]
+  tasks:
+    - name: Assert env + mode
+      ansible.builtin.assert:
+        that:
+          - veza_env is defined
+          - veza_env in ['staging', 'prod']
+          - mode is defined
+          - mode in ['fast', 'full']
+        fail_msg: rollback.yml requires veza_env + mode (fast|full).
+        quiet: true
+
+    - name: Assert target_color when mode=fast
+      ansible.builtin.assert:
+        that:
+          - target_color is defined
+          - target_color in ['blue', 'green']
+        fail_msg: rollback.yml mode=fast requires target_color (blue|green).
+        quiet: true
+      when: mode == 'fast'
+
+    - name: Assert release_sha when mode=full
+      ansible.builtin.assert:
+        that:
+          - veza_release_sha is defined
+          - veza_release_sha | length == 40
+        fail_msg: rollback.yml mode=full requires veza_release_sha (40-char SHA).
+        quiet: true
+      when: mode == 'full'
+
+# ---------------------------------------------------------------------
+# mode=fast → HAProxy flip only.
+# ---------------------------------------------------------------------
+- name: Fast rollback — verify target_color containers are alive
+  hosts: incus_hosts
+  become: true
+  gather_facts: false
+  tasks:
+    - name: Check each target-color container exists
+      ansible.builtin.shell: |
+        set -e
+        CT="{{ veza_container_prefix }}{{ item }}-{{ target_color }}"
+        if ! incus info "$CT" >/dev/null 2>&1; then
+          echo "MISSING $CT"
+          exit 1
+        fi
+        # 'incus list NAME' filters by prefix/regex, so pick the row whose
+        # name is exactly $CT instead of trusting the filter alone.
+        STATE=$(incus list "$CT" -c ns --format csv \
+          | awk -F, -v name="$CT" '$1 == name { print $2 }')
+        if [ "$STATE" != "RUNNING" ]; then
+          echo "$CT is $STATE (not RUNNING)"
+          exit 1
+        fi
+        echo "OK $CT"
+      args:
+        executable: /bin/bash
+      loop:
+        - backend
+        - stream
+        - web
+      changed_when: false
+      register: alive_check
+      when: mode == 'fast'
+      tags: [rollback, fast]
+
+# 'when' is not a play keyword, so the mode gate sits on the fact
+# gathering task and on the role entry rather than on the play.
+- name: Fast rollback — flip HAProxy
+  hosts: "{{ veza_container_prefix + 'haproxy' }}"
+  become: true
+  gather_facts: false
+  vars:
+    ansible_connection: community.general.incus
+    ansible_python_interpreter: /usr/bin/python3
+    veza_active_color: "{{ target_color }}"
+    # Fast rollback re-uses the previous SHA from the history file.
+    # NOTE(review): lookup() is evaluated on the Ansible controller,
+    # not inside the HAProxy container. If the history file only
+    # exists in the container this always falls back to 'rollback'.
+    veza_release_sha: "{{ lookup('ansible.builtin.file', '/var/lib/veza/active-color.history', errors='ignore') | regex_search('sha=([0-9a-f]+)', '\\1') | default(['rollback'], true) | first }}"
+  pre_tasks:
+    - name: Gather facts from the HAProxy container (mode=fast only)
+      ansible.builtin.setup:
+        gather_subset:
+          - all
+      when: mode == 'fast'
+  roles:
+    - role: veza_haproxy_switch
+      when: mode == 'fast'
+  tags: [rollback, fast]
+
+# ---------------------------------------------------------------------
+# mode=full → re-import deploy_app.yml with the rollback SHA.
+# Functionally identical to a fresh deploy of an older release.
+# ---------------------------------------------------------------------
+- name: Full rollback — delegate to deploy_app.yml with release_sha={{ veza_release_sha | default('') }}
+  ansible.builtin.import_playbook: deploy_app.yml
+  when: mode == 'full'
+  tags: [rollback, full]