diff --git a/infra/ansible/roles/veza_haproxy_switch/README.md b/infra/ansible/roles/veza_haproxy_switch/README.md
new file mode 100644
index 000000000..08794cab5
--- /dev/null
+++ b/infra/ansible/roles/veza_haproxy_switch/README.md
@@ -0,0 +1,47 @@
+# `veza_haproxy_switch` role
+
+Atomically swap HAProxy's active color. Runs against the
+`{{ veza_container_prefix }}haproxy` container after `veza_app` has
+recreated + health-probed all three components in the inactive color.
+
+## Why a separate role from `haproxy`?
+
+- `roles/haproxy` provisions a fresh HAProxy container — install
+  the package, lay down the *initial* config, enable the systemd
+  unit. It runs once when the staging/prod env is bootstrapped and
+  occasionally when the global config shape changes.
+- `roles/veza_haproxy_switch` performs the *per-deploy* delta —
+  re-template the cfg with a new `veza_active_color`, validate,
+  swap, HUP. It runs once at the end of every successful deploy.
+
+Splitting them keeps the per-deploy path narrow (no apt, no service
+install) and lets `roles/haproxy` remain idempotent when the global
+shape hasn't changed.
+
+## Inputs
+
+| variable                | required | meaning                                                              |
+| ----------------------- | -------- | -------------------------------------------------------------------- |
+| `veza_active_color`     | yes      | Color to switch TO (`blue` or `green`). Becomes the new active.      |
+| `veza_release_sha`      | yes      | SHA being deployed. Logged in the active-color history file.         |
+| `veza_container_prefix` | inherit  | From `group_vars/<env>.yml`.                                         |
+| `haproxy_topology`      | inherit  | Should be `blue-green` for this role to make sense.                  |
+
+## Failure semantics
+
+The render → validate → atomic-swap → HUP sequence runs in an
+Ansible `block:` with a `rescue:` that restores `haproxy.cfg.bak`
+(captured before the swap) and re-HUPs. So an invalid config or a
+HUP failure leaves HAProxy serving the *previous* active color
+exactly as before — the deploy as a whole then fails on the playbook
+level.
+
+## What the role does NOT do
+
+- It does not destroy or recreate the HAProxy container. That's a
+  one-time operation under `roles/haproxy`.
+- It does not touch app containers — by the time this role runs,
+  blue/green app containers are both healthy.
+- It does not remove the previously-active color's containers. They
+  survive (intentional) so a rollback can flip back instantly. The
+  next deploy naturally recycles them.
diff --git a/infra/ansible/roles/veza_haproxy_switch/defaults/main.yml b/infra/ansible/roles/veza_haproxy_switch/defaults/main.yml
new file mode 100644
index 000000000..2a38419a5
--- /dev/null
+++ b/infra/ansible/roles/veza_haproxy_switch/defaults/main.yml
@@ -0,0 +1,18 @@
+---
+# These should be set by the caller — defaults here are guards that
+# fail loud if the caller forgot to pass them.
+veza_active_color: ""
+veza_release_sha: ""
+
+# Paths inside the HAProxy container.  NOTE(review): tasks/main.yml acts on these via host-level modules (file/copy/template/systemd) — confirm the play's connection/delegation actually targets the container.
+haproxy_cfg_path: /etc/haproxy/haproxy.cfg
+haproxy_cfg_new_path: /etc/haproxy/haproxy.cfg.new
+haproxy_cfg_backup_path: /etc/haproxy/haproxy.cfg.bak
+haproxy_state_dir: /var/lib/veza
+haproxy_active_color_file: /var/lib/veza/active-color
+haproxy_active_color_history: /var/lib/veza/active-color.history
+
+# How many history entries to keep before pruning. The rollback role
+# offers point-in-time switch within this window without redeploying
+# the artefact.
+haproxy_active_color_history_keep: 5
diff --git a/infra/ansible/roles/veza_haproxy_switch/handlers/main.yml b/infra/ansible/roles/veza_haproxy_switch/handlers/main.yml
new file mode 100644
index 000000000..4dbc8c570
--- /dev/null
+++ b/infra/ansible/roles/veza_haproxy_switch/handlers/main.yml
@@ -0,0 +1,9 @@
+---
+# HUP haproxy via systemd reload (graceful — drains old workers).
+# Reachable via `listen: "veza-haproxy reload"`.  NOTE(review): no task
+# in tasks/main.yml notifies this — the switch sequence reloads inline.
+- name: Reload haproxy
+  ansible.builtin.systemd:
+    name: haproxy
+    state: reloaded
+  listen: "veza-haproxy reload"
diff --git a/infra/ansible/roles/veza_haproxy_switch/meta/main.yml b/infra/ansible/roles/veza_haproxy_switch/meta/main.yml
new file mode 100644
index 000000000..221b4b85b
--- /dev/null
+++ b/infra/ansible/roles/veza_haproxy_switch/meta/main.yml
@@ -0,0 +1,16 @@
+---
+galaxy_info:
+  role_name: veza_haproxy_switch
+  author: Veza Ops
+  description: >-
+    Atomically swap HAProxy's active color (blue/green) and persist
+    the new state. Runs once per deploy, after veza_app has
+    health-probed all components in the inactive color. Block/rescue
+    guarantees HAProxy never lands on a bad config.
+  license: proprietary
+  min_ansible_version: "2.15"
+  platforms:
+    - name: Debian
+      versions: ["13"]
+
+dependencies: []
diff --git a/infra/ansible/roles/veza_haproxy_switch/tasks/main.yml b/infra/ansible/roles/veza_haproxy_switch/tasks/main.yml
new file mode 100644
index 000000000..14127289c
--- /dev/null
+++ b/infra/ansible/roles/veza_haproxy_switch/tasks/main.yml
@@ -0,0 +1,142 @@
+# Atomic blue/green switch. The HAProxy template lives in
+# roles/haproxy/templates/haproxy.cfg.j2 — it reads veza_active_color
+# to render the right `backup` directives. We re-template, validate,
+# atomic-swap, HUP.
+#
+# Block/rescue: any failure in the four-step sequence restores
+# haproxy.cfg from the backup we capture before touching anything.
+# That way, an invalid template or a HUP error never leaves HAProxy
+# serving from a stale or broken cfg — it stays on whatever was
+# active when the role started.
+---
+- name: Validate inputs
+  ansible.builtin.assert:
+    that:
+      - veza_active_color in ['blue', 'green']
+      - veza_release_sha | length == 40
+    fail_msg: >-
+      veza_haproxy_switch role requires veza_active_color (blue|green)
+      and veza_release_sha (40-char git SHA). Got: color={{ veza_active_color }}
+      sha={{ veza_release_sha }}.
+    quiet: true
+  tags: [veza_haproxy_switch, always]
+
+- name: Ensure veza state dir exists in HAProxy container
+  ansible.builtin.file:
+    path: "{{ haproxy_state_dir }}"
+    state: directory
+    owner: root
+    group: root
+    mode: "0755"
+  tags: [veza_haproxy_switch]
+
+- name: Read currently-active color (if any)
+  ansible.builtin.slurp:
+    src: "{{ haproxy_active_color_file }}"
+  register: prior_color_raw
+  failed_when: false
+  changed_when: false
+  tags: [veza_haproxy_switch]
+
+- name: Resolve prior_active_color (default blue if no history)
+  ansible.builtin.set_fact:
+    prior_active_color: >-
+      {{ (prior_color_raw.content | b64decode | trim) if prior_color_raw.content is defined
+         else 'blue' }}
+  tags: [veza_haproxy_switch]
+
+- name: Switch sequence (block/rescue — restores cfg on any failure)
+  block:
+    - name: Backup current haproxy.cfg
+      ansible.builtin.copy:
+        src: "{{ haproxy_cfg_path }}"
+        dest: "{{ haproxy_cfg_backup_path }}"
+        remote_src: true
+        mode: "0640"
+      tags: [veza_haproxy_switch]
+
+    - name: Render fresh haproxy.cfg with new active_color
+      ansible.builtin.template:
+        src: "{{ playbook_dir }}/../roles/haproxy/templates/haproxy.cfg.j2"
+        dest: "{{ haproxy_cfg_new_path }}"
+        owner: root
+        group: haproxy
+        mode: "0640"
+        validate: "haproxy -f %s -c -q"
+      vars:
+        # The template currently reads `veza_active_color`, which is
+        # already in scope from the caller.  Additionally expose the
+        # same value as `haproxy_active_color` in case a future
+        # template revision reads that name instead.
+        haproxy_active_color: "{{ veza_active_color }}"
+      tags: [veza_haproxy_switch]
+
+    - name: Atomic swap — mv haproxy.cfg.new → haproxy.cfg
+      ansible.builtin.command: mv -f "{{ haproxy_cfg_new_path }}" "{{ haproxy_cfg_path }}"
+      changed_when: true
+      tags: [veza_haproxy_switch]
+
+    - name: HUP haproxy (graceful reload, no connection drop)
+      ansible.builtin.systemd:
+        name: haproxy
+        state: reloaded
+      tags: [veza_haproxy_switch]
+  rescue:
+    - name: Restore haproxy.cfg from backup
+      ansible.builtin.command: mv -f "{{ haproxy_cfg_backup_path }}" "{{ haproxy_cfg_path }}"
+      failed_when: false  # best-effort: backup may not exist if we failed before capturing it
+      changed_when: true
+      tags: [veza_haproxy_switch]
+
+    - name: HUP haproxy back to the prior config
+      ansible.builtin.systemd:
+        name: haproxy
+        state: reloaded
+      failed_when: false
+      tags: [veza_haproxy_switch]
+
+    - name: Report the failure
+      ansible.builtin.fail:
+        msg: >-
+          HAProxy switch to color {{ veza_active_color }} (sha
+          {{ veza_release_sha[:12] }}) failed — config rolled back
+          to the prior state. HAProxy continues serving from
+          {{ prior_active_color }}. Inspect the validate step's
+          stderr in the playbook output above.
+
+# Success path: persist new active color + history.
+- name: Write new active color
+  ansible.builtin.copy:
+    dest: "{{ haproxy_active_color_file }}"
+    content: "{{ veza_active_color }}\n"
+    owner: root
+    group: root
+    mode: "0644"
+  tags: [veza_haproxy_switch]
+
+- name: Append to active-color history
+  ansible.builtin.lineinfile:
+    path: "{{ haproxy_active_color_history }}"
+    line: "{{ ansible_date_time.iso8601 }} sha={{ veza_release_sha }} color={{ veza_active_color }} prior={{ prior_active_color }}"
+    create: true
+    insertbefore: BOF  # newest entry first, so the prune step's `head` keeps the most recent
+    mode: "0644"
+  tags: [veza_haproxy_switch]
+
+- name: Prune history beyond keep limit
+  ansible.builtin.shell: |
+    set -e
+    if [ -f "{{ haproxy_active_color_history }}" ] && [ "$(wc -l < "{{ haproxy_active_color_history }}")" -gt {{ haproxy_active_color_history_keep }} ]; then
+      head -n {{ haproxy_active_color_history_keep }} "{{ haproxy_active_color_history }}" > "{{ haproxy_active_color_history }}.tmp"
+      mv -f "{{ haproxy_active_color_history }}.tmp" "{{ haproxy_active_color_history }}"
+    fi
+  args:
+    executable: /bin/bash
+  changed_when: false  # housekeeping; the wc -l guard means the file is only rewritten when it actually exceeds the keep limit
+  tags: [veza_haproxy_switch]
+
+- name: Drop the now-stale backup
+  ansible.builtin.file:
+    path: "{{ haproxy_cfg_backup_path }}"
+    state: absent
+  tags: [veza_haproxy_switch]