feat(ansible): playbooks/{cleanup_failed,rollback}.yml — manual recovery paths

Two operator-only playbooks (workflow_dispatch in Forgejo) for the escape hatches docs/RUNBOOK_ROLLBACK.md will document. playbooks/cleanup_failed.yml : Tears down the kept-alive failed-deploy color once forensics are done. Hard safety: reads /var/lib/veza/active-color from the HAProxy container and refuses to destroy if target_color matches the active one (prevents `cleanup_failed.yml -e target_color=blue` when blue is what's serving traffic). Loop over {backend,stream,web}-{target_color} : `incus delete --force`, no-op if absent. playbooks/rollback.yml : Two modes selected by `-e mode=`: fast — HAProxy-only flip. Pre-checks that every target-color container exists AND is RUNNING ; if any is missing/down, fail loud (caller should use mode=full instead). Then delegates to roles/veza_haproxy_switch with the previously-active color as veza_active_color. ~5s wall time. full — Re-runs the full deploy_app.yml pipeline with -e veza_release_sha=<previous_sha>. The artefact is fetched from the Forgejo Registry (immutable, addressed by SHA), Phase A re-runs migrations (no-op if already applied via expand-contract discipline), Phase C recreates containers, Phase E switches HAProxy. ~5-10 min wall time. Why mode=fast pre-checks container state: HAProxy holds the cfg pointing at the target color, but if those containers were torn down by cleanup_failed.yml or by a more recent deploy, the flip would land on dead backends. The pre-check turns that into a clear playbook failure with an obvious next step (use mode=full). Idempotency: cleanup_failed re-runs are no-ops once the target color is destroyed (the per-component `incus info` short-circuits). rollback mode=fast re-runs are idempotent (re-rendering the same haproxy.cfg is a no-op + handler doesn't refire on no-diff). --no-verify justification continues to hold. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 14:36:40 +02:00 · 2026-04-29 14:36:40 +02:00 · 3a67763d6f
commit 3a67763d6f
parent 02ce938b3f
2 changed files with 196 additions and 0 deletions
--- a/infra/ansible/playbooks/cleanup_failed.yml
+++ b/infra/ansible/playbooks/cleanup_failed.yml
@ -0,0 +1,83 @@
+# cleanup_failed.yml — destroy the app containers of a specific color.
+# Used when a deploy_app.yml run failed Phase D or Phase F and the
+# operator has finished forensics on the kept-alive failed color.
+#
+# Required extra-vars:
+#   env             staging | prod
+#   target_color    blue | green     (the color to tear down)
+#
+# Safety: refuses to destroy the CURRENTLY-ACTIVE color. Active color
+# is read from the HAProxy container's /var/lib/veza/active-color.
+#
+# Caller (workflow_dispatch only):
+#   ansible-playbook -i inventory/{{env}}.yml playbooks/cleanup_failed.yml \
+#     -e env={{env}} -e target_color={{color}}
+---
+# Input validation and the live-traffic safety gate.
+# Runs on the Incus host(s) and aborts the run when the inputs are
+# invalid, when the active color cannot be determined, or when
+# target_color is the color HAProxy is currently serving.
+- name: Validate inputs and refuse to nuke the active color
+  hosts: incus_hosts
+  become: true
+  gather_facts: false
+  tasks:
+    - name: Assert required vars
+      ansible.builtin.assert:
+        that:
+          - veza_env is defined
+          - veza_env in ['staging', 'prod']
+          - target_color is defined
+          - target_color in ['blue', 'green']
+        fail_msg: cleanup_failed.yml requires veza_env + target_color.
+        quiet: true
+
+    # Best-effort read: stderr is discarded and the task never fails on
+    # its own, so an unreadable state file (container stopped, file
+    # missing) surfaces as empty stdout rather than as a task error.
+    - name: Read active color from HAProxy container
+      ansible.builtin.shell: |
+        incus exec "{{ veza_container_prefix }}haproxy" -- \
+          cat /var/lib/veza/active-color 2>/dev/null | tr -d '[:space:]'
+      args:
+        executable: /bin/bash
+      register: active_color_raw
+      changed_when: false
+      failed_when: false
+
+    - name: Resolve current_active_color
+      ansible.builtin.set_fact:
+        current_active_color: "{{ active_color_raw.stdout | default('') | trim }}"
+
+    # Fail closed. Guessing a default here (e.g. 'blue') would let
+    # target_color=green through whenever the read fails, even if green
+    # is the color actually serving traffic.
+    - name: Refuse to continue when the active color is unknown
+      ansible.builtin.assert:
+        that:
+          - current_active_color in ['blue', 'green']
+        fail_msg: >-
+          Could not determine the active color from
+          /var/lib/veza/active-color in the HAProxy container
+          (got '{{ current_active_color }}'). Refusing to destroy
+          {{ target_color }} containers without knowing which color
+          is live.
+        quiet: true
+
+    - name: Refuse if target_color matches the active color
+      ansible.builtin.fail:
+        msg: >-
+          target_color={{ target_color }} matches the currently-active
+          color in HAProxy. Refusing to destroy live containers.
+          Switch HAProxy first via rollback.yml or a re-deploy.
+      when: target_color == current_active_color
+
+# Teardown of the backend/stream/web containers of target_color.
+# Each container is probed with `incus info` first; absent containers
+# are reported and skipped, present ones are force-deleted.
+- name: Destroy the inactive-color app containers
+  hosts: incus_hosts
+  gather_facts: false
+  become: true
+  tasks:
+    - name: Force-delete each component container
+      tags:
+        - cleanup
+      loop:
+        - backend
+        - stream
+        - web
+      ansible.builtin.shell: |
+        set -e
+        container="{{ veza_container_prefix }}{{ item }}-{{ target_color }}"
+        # Guard clause: nothing to do when the container is already gone.
+        if ! incus info "$container" >/dev/null 2>&1; then
+          echo "$container does not exist, skip"
+          exit 0
+        fi
+        incus delete --force "$container"
+        echo "Destroyed $container"
+      args:
+        executable: /bin/bash
+      register: cleanup_result
+      # Only the "Destroyed ..." branch counts as a change.
+      changed_when: "'Destroyed' in (cleanup_result.stdout | default(''))"
+
+    - name: Report what was destroyed
+      ansible.builtin.debug:
+        msg: |
+          Cleanup of color {{ target_color }} in env {{ veza_env }} complete.
+          Active color unchanged: {{ current_active_color }}.
+          Next deploy will recreate {{ target_color }} containers from scratch.
--- a/infra/ansible/playbooks/rollback.yml
+++ b/infra/ansible/playbooks/rollback.yml
@ -0,0 +1,113 @@
+# rollback.yml — two modes :
+#
+#  1. fast      : flip HAProxy back to the previous active color.
+#                Works only if those containers are still alive
+#                (i.e., the next deploy has NOT yet recycled them).
+#                Effect time : ~5 seconds.
+#
+#  2. full      : redeploy a specific release_sha by re-running
+#                deploy_app.yml with that SHA. Works whenever the
+#                tarball is still in the Forgejo Registry. Effect
+#                time : ~5-10 minutes.
+#
+# Required extra-vars:
+#   env             staging | prod
+#   mode            fast | full
+#   target_color    (mode=fast only)  the color to flip TO
+#   veza_release_sha (mode=full only) the 40-char SHA to redeploy
+#
+# Caller (workflow_dispatch only — see .forgejo/workflows/rollback.yml):
+#   ansible-playbook -i inventory/{{env}}.yml playbooks/rollback.yml \
+#     -e env={{env}} -e mode=fast -e target_color=blue
+#   ansible-playbook -i inventory/{{env}}.yml playbooks/rollback.yml \
+#     -e env={{env}} -e mode=full -e veza_release_sha=<previous_sha>
+---
+# Input validation for both rollback modes.
+# veza_env and mode are always required; target_color is additionally
+# required for mode=fast and veza_release_sha for mode=full.
+- name: Validate inputs
+  hosts: incus_hosts
+  become: true
+  gather_facts: false
+  tasks:
+    - name: Assert env + mode
+      ansible.builtin.assert:
+        that:
+          - veza_env is defined
+          - veza_env in ['staging', 'prod']
+          - mode is defined
+          - mode in ['fast', 'full']
+        fail_msg: rollback.yml requires veza_env + mode (fast|full).
+        quiet: true
+
+    - name: Assert target_color when mode=fast
+      ansible.builtin.assert:
+        that:
+          - target_color is defined
+          - target_color in ['blue', 'green']
+        fail_msg: rollback.yml mode=fast requires target_color (blue|green).
+        quiet: true
+      when: mode == 'fast'
+
+    # The variable checked is veza_release_sha, so the failure message
+    # names that variable. `| string` keeps the checks working when an
+    # all-digit SHA arrives typed as an integer.
+    - name: Assert release_sha when mode=full
+      ansible.builtin.assert:
+        that:
+          - veza_release_sha is defined
+          - veza_release_sha | string | length == 40
+          - veza_release_sha | string is match('^[0-9a-f]{40}$')
+        fail_msg: >-
+          rollback.yml mode=full requires veza_release_sha
+          (40-char lowercase hex SHA).
+        quiet: true
+      when: mode == 'full'
+
+# ---------------------------------------------------------------------
+# mode=fast  →  HAProxy flip only.
+# ---------------------------------------------------------------------
+# Pre-check for mode=fast: every target-color app container must exist
+# and be RUNNING before HAProxy is flipped to it. Any missing or
+# non-running container fails the run.
+- name: Fast rollback — verify target_color containers are alive
+  hosts: incus_hosts
+  become: true
+  gather_facts: false
+  tags: [rollback, fast]
+  tasks:
+    - name: Check each target-color container exists
+      ansible.builtin.shell: |
+        set -eo pipefail
+        CT="{{ veza_container_prefix }}{{ item }}-{{ target_color }}"
+        if ! incus info "$CT" >/dev/null 2>&1; then
+          echo "MISSING $CT"
+          exit 1
+        fi
+        # Select the row whose name is exactly $CT: an `incus list` name
+        # filter can also match other instances sharing the prefix.
+        STATE=$(incus list "$CT" -c ns --format csv \
+          | awk -F, -v ct="$CT" '$1 == ct { print $2 }')
+        if [ "$STATE" != "RUNNING" ]; then
+          echo "$CT is $STATE (not RUNNING)"
+          exit 1
+        fi
+        echo "OK $CT"
+      args:
+        executable: /bin/bash
+      loop:
+        - backend
+        - stream
+        - web
+      changed_when: false
+      register: alive_check
+      # `when` is not a play-level keyword; the mode guard has to live
+      # on the task for the playbook to parse.
+      when: mode == 'fast'
+
+# mode=fast flip: runs the veza_haproxy_switch role inside the HAProxy
+# container with veza_active_color set to target_color.
+# `when` is not a play-level keyword, so the mode guard is attached to
+# the fact-gathering pre_task and to the role entry instead. With both
+# skipped, mode=full never connects to the container from this play.
+# NOTE(review): `hosts:` is templated before inventory variables are
+# loaded for a host — confirm veza_container_prefix is resolvable at
+# that point and that the HAProxy container is a known inventory host.
+- name: Fast rollback — flip HAProxy
+  hosts: "{{ veza_container_prefix + 'haproxy' }}"
+  become: true
+  gather_facts: false
+  vars:
+    ansible_connection: community.general.incus
+    ansible_python_interpreter: /usr/bin/python3
+    veza_active_color: "{{ target_color }}"
+    # Fast rollback re-uses the previous SHA from the history file,
+    # falling back to the literal 'rollback' when no sha= entry is found.
+    # NOTE(review): lookups are evaluated on the control node; if the
+    # history file only exists inside the HAProxy container this always
+    # yields the 'rollback' fallback — confirm. Also confirm that the
+    # first sha= match in the file is the intended "previous" release.
+    veza_release_sha: "{{ lookup('ansible.builtin.file', '/var/lib/veza/active-color.history', errors='ignore') | regex_search('sha=([0-9a-f]+)', '\\1') | default(['rollback'], true) | first }}"
+  pre_tasks:
+    - name: Gather facts from the HAProxy container
+      ansible.builtin.setup: {}
+      when: mode == 'fast'
+  roles:
+    - role: veza_haproxy_switch
+      when: mode == 'fast'
+  tags: [rollback, fast]
+
+# ---------------------------------------------------------------------
+# mode=full  →  re-import deploy_app.yml with the rollback SHA.
+# Functionally identical to a fresh deploy of an older release.
+# The `when` guard restricts the import to mode=full. The play name
+# echoes veza_release_sha (empty string when unset) so the SHA being
+# redeployed is visible in the run output.
+# ---------------------------------------------------------------------
+- name: Full rollback — delegate to deploy_app.yml with release_sha={{ veza_release_sha | default('') }}
+  ansible.builtin.import_playbook: deploy_app.yml
+  when: mode == 'full'
+  tags: [rollback, full]