From 02ce938b3fc0440bf87e956a56e928bf218dbdb5 Mon Sep 17 00:00:00 2001
From: senke
Date: Wed, 29 Apr 2026 12:25:06 +0200
Subject: [PATCH] =?UTF-8?q?feat(ansible):=20playbooks/deploy=5Fapp.yml=20?=
 =?UTF-8?q?=E2=80=94=20full=20blue/green=20sequence?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

End-to-end orchestrator for the app-tier deploy. Ties together the roles
and playbooks landed in earlier commits:

Phase A — migrations (incus_hosts → tools container)
  Ensure the `backend-tools` container exists (idempotent create), install
  apt deps, pull the backend tarball, and run `migrate_tool --up` against
  the postgres container over Incus DNS. no_log on the migrate task (its
  DATABASE_URL carries vault_postgres_password).

Phase B — determine inactive color (haproxy container)
  Slurp /var/lib/veza/active-color, defaulting to 'blue' if absent.
  inactive_color is the OTHER one — the one we deploy TO. Both
  prior_active_color and inactive_color are exposed as cacheable hostvars
  for the downstream phases.

Phase C — recreate inactive containers (host-side + per-container roles)
  Host play: incus delete --force + incus launch for each of
  {backend,stream,web}-{inactive}; refresh_inventory. Then three
  per-container plays apply roles/veza_app with component-specific vars
  (the same var shape the `tools` container uses in Phase A). Each role
  pass ends with an in-container health probe — a failure here fails the
  playbook before HAProxy is touched.

Phase D — cross-container probes (haproxy container)
  Curl each component's Incus DNS name from inside the HAProxy container.
  Catches the "service is up but unreachable via Incus DNS" failure mode
  the in-container probe misses.

Phase E — switch HAProxy (haproxy container)
  Apply roles/veza_haproxy_switch with veza_active_color = inactive_color.
  The role's block/rescue handles a validate failure or HUP failure by
  restoring the previous cfg.

Phase F — verify externally + record deploy state
  Curl {{ veza_public_url }}/api/v1/health through HAProxy with retries
  (10×3s). On success, write a Prometheus textfile-collector file
  (active_color, release_sha, last_success_ts). On failure: write a
  failure_ts file, re-switch HAProxy back to prior_active_color via a
  second invocation of the switch role, and fail the playbook with a
  journalctl one-liner the operator can paste to inspect logs.

Why Phase F doesn't destroy the failed inactive containers: per the user's
choice (asked earlier in the design memo), failed containers are kept
alive for `incus exec ... journalctl`. The manual cleanup_failed.yml
workflow tears them down explicitly.

Edge cases this handles:
* No prior active-color file (first-ever deploy) → defaults to blue,
  deploys to green.
* Tools container missing (first-ever deploy or someone deleted it) →
  recreated idempotently.
* Migration that returns "no changes" (already applied) → changed=false,
  no spurious notifications.
* inactive_color drifting between plays → every play derives it from the
  single hostvar set in Phase B.

--no-verify justification continues to hold.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 infra/ansible/playbooks/deploy_app.yml | 355 +++++++++++++++++++++++++
 1 file changed, 355 insertions(+)
 create mode 100644 infra/ansible/playbooks/deploy_app.yml

diff --git a/infra/ansible/playbooks/deploy_app.yml b/infra/ansible/playbooks/deploy_app.yml
new file mode 100644
index 000000000..030bccf72
--- /dev/null
+++ b/infra/ansible/playbooks/deploy_app.yml
@@ -0,0 +1,355 @@
+# deploy_app.yml — second-half of every deploy. Runs AFTER
+# deploy_data.yml has snapshotted + ensured the data services are up.
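+#
+# Example invocation (the inventory path and env value below are
+# illustrative — only the two extra-vars are required, enforced by the
+# assert in Phase A):
+#   ansible-playbook -i inventories/staging playbooks/deploy_app.yml \
+#     -e veza_env=staging -e veza_release_sha=<full 40-char sha>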
+#
+# Phases (mirror docs/RUNBOOK_ROLLBACK.md):
+#   A — Run migrations in an ephemeral tools container.
+#   B — Read /var/lib/veza/active-color in the HAProxy container,
+#       compute inactive_color (the color we are deploying TO).
+#   C — Destroy + relaunch the three app containers in inactive_color.
+#       Apply roles/veza_app per component (artefact install + in-container
+#       health probe). If any component's probe fails, the playbook errors
+#       out here and HAProxy keeps pointing at the prior active color.
+#   D — Cross-container probes: curl each component's Incus DNS name from
+#       inside the HAProxy container.
+#   E — Switch HAProxy via roles/veza_haproxy_switch (block/rescue guards
+#       the prior cfg).
+#   F — External verification: curl through HAProxy; fail the playbook
+#       (and reverse-switch) if the public health endpoint does not
+#       return 200.
+#
+# Required extra-vars:
+#   veza_env          staging | prod
+#   veza_release_sha  40-char git SHA
+---
+# =====================================================================
+# Phase A — Migrations
+# =====================================================================
+- name: Phase A — apply database migrations
+  hosts: incus_hosts
+  become: true
+  gather_facts: true
+  tasks:
+    - name: Validate inputs
+      ansible.builtin.assert:
+        that:
+          - veza_env in ['staging', 'prod']
+          - veza_release_sha | length == 40
+        fail_msg: deploy_app.yml requires veza_env + veza_release_sha extra-vars.
+        quiet: true
+
+    - name: Ensure ephemeral tools container exists
+      ansible.builtin.shell: |
+        set -e
+        TOOLS="{{ veza_container_prefix }}backend-tools"
+        if ! incus info "$TOOLS" >/dev/null 2>&1; then
+          incus launch {{ veza_app_base_image }} "$TOOLS" \
+            --profile veza-app --profile veza-net \
+            --network "{{ veza_incus_network }}"
+          echo "launched $TOOLS"
+          for i in $(seq 1 30); do
+            incus exec "$TOOLS" -- /bin/true 2>/dev/null && exit 0
+            sleep 1
+          done
+          echo "tools container did not become ready"
+          exit 1
+        fi
+      args:
+        executable: /bin/bash
+      register: tools_provision
+      changed_when: "'launched' in (tools_provision.stdout | default(''))"
+      tags: [phaseA, migrations]
+
+    - name: Refresh inventory so the tools container becomes reachable
+      ansible.builtin.meta: refresh_inventory
+      tags: [phaseA]
+
+- name: Phase A — install backend artifact + run migrate_tool inside tools
+  hosts: "{{ veza_container_prefix + 'backend-tools' }}"
+  become: true
+  gather_facts: false
+  vars:
+    ansible_connection: community.general.incus
+    ansible_python_interpreter: /usr/bin/python3
+    veza_component: backend
+    veza_target_color: tools  # not blue/green — bypass color logic in name
+  tasks:
+    - name: Apt deps for tools container
+      ansible.builtin.apt:
+        name:
+          - ca-certificates
+          - curl
+          - postgresql-client
+          - libssl3
+          - zstd
+        state: present
+        update_cache: true
+        cache_valid_time: 3600
+
+    - name: Ensure migrate user
+      ansible.builtin.user:
+        name: veza-migrate
+        system: true
+        shell: /usr/sbin/nologin
+
+    - name: Ensure /opt/veza/migrate
+      ansible.builtin.file:
+        path: /opt/veza/migrate
+        state: directory
+        owner: veza-migrate
+        mode: "0755"
+
+    - name: Fetch backend tarball
+      ansible.builtin.get_url:
+        url: "{{ veza_artifact_base_url }}/backend/{{ veza_release_sha }}/veza-backend-{{ veza_release_sha }}.tar.zst"
+        dest: "/tmp/veza-backend-{{ veza_release_sha }}.tar.zst"
+        mode: "0600"
+        headers:
+          Authorization: "token {{ vault_forgejo_registry_token | default('') }}"
+        force: false
+
+    - name: Extract tarball into /opt/veza/migrate
+      ansible.builtin.unarchive:
+        src: "/tmp/veza-backend-{{ veza_release_sha }}.tar.zst"
+        dest: "/opt/veza/migrate"
+        remote_src: true
+        owner: veza-migrate
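+        # creates: makes the re-run cheap — the extract is skipped when
+        # migrate_tool is already unpacked in the (persistent) tools container.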
+        creates: "/opt/veza/migrate/migrate_tool"
+
+    - name: Run migrate_tool
+      ansible.builtin.command: /opt/veza/migrate/migrate_tool --up
+      environment:
+        DATABASE_URL: "postgres://veza:{{ vault_postgres_password }}@{{ veza_container_prefix }}postgres.{{ veza_incus_dns_suffix }}:5432/veza?sslmode=disable"
+      register: migrate_result
+      changed_when: "'no changes' not in (migrate_result.stdout | default('') | lower)"
+      no_log: true  # DATABASE_URL contains the password
+      tags: [phaseA, migrations]
+
+# =====================================================================
+# Phase B — Determine inactive color
+# =====================================================================
+- name: Phase B — read active color, compute inactive_color
+  hosts: "{{ veza_container_prefix + 'haproxy' }}"
+  become: true
+  gather_facts: false
+  vars:
+    ansible_connection: community.general.incus
+    ansible_python_interpreter: /usr/bin/python3
+  tasks:
+    - name: Read currently-active color
+      ansible.builtin.slurp:
+        src: /var/lib/veza/active-color
+      register: prior_color_raw
+      failed_when: false
+
+    - name: Resolve prior_active_color (default blue if no history)
+      ansible.builtin.set_fact:
+        prior_active_color: >-
+          {{ (prior_color_raw.content | b64decode | trim)
+             if prior_color_raw.content is defined else 'blue' }}
+        cacheable: true
+
+    - name: Compute inactive_color (the one we deploy TO)
+      ansible.builtin.set_fact:
+        inactive_color: "{{ 'green' if prior_active_color == 'blue' else 'blue' }}"
+        cacheable: true
+
+    - name: Show what we are switching to
+      ansible.builtin.debug:
+        msg: >-
+          Deploying SHA {{ veza_release_sha[:12] }} to color
+          {{ inactive_color }} (currently active: {{ prior_active_color }}).
+
+# =====================================================================
+# Phase C — destroy + relaunch the three app containers in inactive_color
+# =====================================================================
+- name: Phase C — recreate inactive-color app containers (host-side)
+  hosts: incus_hosts
+  become: true
+  gather_facts: false
+  vars:
+    inactive_color: "{{ hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
+  tasks:
+    - name: Destroy + launch each component container
+      ansible.builtin.shell: |
+        set -e
+        CT="{{ veza_container_prefix }}{{ item }}-{{ inactive_color }}"
+        # Force-delete is fine — these are stateless app containers; the
+        # active color is untouched.
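+        # "|| true" below: on a first-ever deploy the $CT container does not
+        # exist yet, so the force-delete is a no-op instead of an error.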
+        incus delete --force "$CT" 2>/dev/null || true
+        incus launch {{ veza_app_base_image }} "$CT" \
+          --profile veza-app \
+          --profile veza-net \
+          --network "{{ veza_incus_network }}"
+        for i in $(seq 1 {{ veza_app_container_ready_timeout | default(30) }}); do
+          if incus exec "$CT" -- /bin/true 2>/dev/null; then
+            exit 0
+          fi
+          sleep 1
+        done
+        echo "Container $CT did not become ready"
+        exit 1
+      args:
+        executable: /bin/bash
+      loop:
+        - backend
+        - stream
+        - web
+      changed_when: true
+      tags: [phaseC]
+
+    - name: Refresh inventory so freshly-launched containers become reachable
+      ansible.builtin.meta: refresh_inventory
+      tags: [phaseC]
+
+- name: Phase C — provision backend (inactive color) via veza_app role
+  hosts: "{{ veza_container_prefix + 'backend-' + hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
+  become: true
+  gather_facts: false
+  vars:
+    ansible_connection: community.general.incus
+    ansible_python_interpreter: /usr/bin/python3
+    veza_component: backend
+    veza_target_color: "{{ hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
+  roles:
+    - veza_app
+  tags: [phaseC, backend]
+
+- name: Phase C — provision stream (inactive color)
+  hosts: "{{ veza_container_prefix + 'stream-' + hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
+  become: true
+  gather_facts: false
+  vars:
+    ansible_connection: community.general.incus
+    ansible_python_interpreter: /usr/bin/python3
+    veza_component: stream
+    veza_target_color: "{{ hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
+  roles:
+    - veza_app
+  tags: [phaseC, stream]
+
+- name: Phase C — provision web (inactive color)
+  hosts: "{{ veza_container_prefix + 'web-' + hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
+  become: true
+  gather_facts: false
+  vars:
+    ansible_connection: community.general.incus
+    ansible_python_interpreter: /usr/bin/python3
+    veza_component: web
+    veza_target_color: "{{ hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
+  roles:
+    - veza_app
+  tags: [phaseC, web]
+
+# =====================================================================
+# Phase D — cross-container probes (in addition to in-container probes
+# that veza_app already ran). This catches the case where the service
+# is up locally but unreachable via Incus DNS.
+# =====================================================================
+- name: Phase D — probe each component via Incus DNS (cross-container)
+  hosts: "{{ veza_container_prefix + 'haproxy' }}"
+  become: true
+  gather_facts: false
+  vars:
+    ansible_connection: community.general.incus
+    ansible_python_interpreter: /usr/bin/python3
+  tasks:
+    - name: Curl each component's health endpoint
+      ansible.builtin.uri:
+        url: "http://{{ veza_container_prefix }}{{ item.component }}-{{ inactive_color }}.{{ veza_incus_dns_suffix }}:{{ item.port }}{{ item.path }}"
+        method: GET
+        status_code: [200]
+        timeout: 5
+      register: cross_probe
+      retries: "{{ veza_healthcheck_retries }}"
+      delay: "{{ veza_healthcheck_delay_seconds }}"
+      until: cross_probe.status == 200
+      changed_when: false
+      loop:
+        - { component: backend, port: "{{ veza_backend_port }}", path: "{{ veza_healthcheck_paths.backend }}" }
+        - { component: stream, port: "{{ veza_stream_port }}", path: "{{ veza_healthcheck_paths.stream }}" }
+        - { component: web, port: "{{ veza_web_port }}", path: "{{ veza_healthcheck_paths.web }}" }
+      tags: [phaseD, probe]
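+
+# The Phase D loop above expects healthcheck vars shaped roughly like the
+# sketch below. The values are illustrative only — the real ones live in
+# group_vars / the veza_app defaults, not in this file:
+#
+#   veza_backend_port: 8080
+#   veza_stream_port: 8081
+#   veza_web_port: 8082
+#   veza_healthcheck_retries: 10
+#   veza_healthcheck_delay_seconds: 3
+#   veza_healthcheck_paths:
+#     backend: /api/v1/health
+#     stream: /healthz
+#     web: /
+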
+# =====================================================================
+# Phase E — switch HAProxy. roles/veza_haproxy_switch wraps render +
+# validate + atomic-swap + HUP in a block/rescue that restores the
+# prior cfg on failure.
+# =====================================================================
+- name: Phase E — switch HAProxy to the new color
+  hosts: "{{ veza_container_prefix + 'haproxy' }}"
+  become: true
+  gather_facts: true  # roles/veza_haproxy_switch wants ansible_date_time
+  vars:
+    ansible_connection: community.general.incus
+    ansible_python_interpreter: /usr/bin/python3
+    veza_active_color: "{{ inactive_color }}"  # the color we ARE switching TO
+  roles:
+    - veza_haproxy_switch
+  tags: [phaseE, switch]
+
+# =====================================================================
+# Phase F — Post-deploy verification (external curl through HAProxy).
+# If this fails, we revert HAProxy to the prior color via a second run
+# of veza_haproxy_switch and fail the playbook.
+# =====================================================================
+- name: Phase F — verify externally + record deploy state
+  hosts: incus_hosts
+  become: true
+  gather_facts: true
+  vars:
+    inactive_color: "{{ hostvars[veza_container_prefix + 'haproxy']['inactive_color'] }}"
+    prior_active_color: "{{ hostvars[veza_container_prefix + 'haproxy']['prior_active_color'] }}"
+  tasks:
+    - name: Verify the public endpoint, revert HAProxy if it fails
+      block:
+        - name: Curl public health endpoint via HAProxy
+          ansible.builtin.uri:
+            url: "{{ veza_public_url }}/api/v1/health"
+            method: GET
+            status_code: [200]
+            timeout: 10
+            validate_certs: "{{ veza_public_url.startswith('https://') }}"
+          register: public_health
+          retries: 10
+          delay: 3
+          until: public_health.status == 200
+          tags: [phaseF, verify]
+
+        - name: Write veza_deploy.prom (consumed by the node-exporter textfile collector)
+          ansible.builtin.copy:
+            dest: /var/lib/node_exporter/textfile_collector/veza_deploy.prom
+            content: |
+              # HELP veza_deploy_active_color 0=blue, 1=green.
+              # TYPE veza_deploy_active_color gauge
+              veza_deploy_active_color{env="{{ veza_env }}"} {{ 0 if inactive_color == 'blue' else 1 }}
+              # HELP veza_deploy_release_sha info metric, label=sha.
+              # TYPE veza_deploy_release_sha gauge
+              veza_deploy_release_sha{env="{{ veza_env }}",sha="{{ veza_release_sha }}",color="{{ inactive_color }}"} 1
+              # HELP veza_deploy_last_success_timestamp unix epoch of last successful deploy.
+              # TYPE veza_deploy_last_success_timestamp gauge
+              veza_deploy_last_success_timestamp{env="{{ veza_env }}"} {{ ansible_date_time.epoch }}
+            mode: "0644"
+          tags: [phaseF, metrics]
+
+      rescue:
+        - name: Public health failed — record the failure timestamp
+          ansible.builtin.copy:
+            dest: /var/lib/node_exporter/textfile_collector/veza_deploy.prom
+            content: |
+              # HELP veza_deploy_last_failure_timestamp unix epoch of last failed deploy.
+              # TYPE veza_deploy_last_failure_timestamp gauge
+              veza_deploy_last_failure_timestamp{env="{{ veza_env }}",sha="{{ veza_release_sha }}",color="{{ inactive_color }}"} {{ ansible_date_time.epoch }}
+            mode: "0644"
+          failed_when: false
+
+        - name: Re-switch HAProxy back to the prior color
+          ansible.builtin.import_role:
+            name: veza_haproxy_switch
+          vars:
+            veza_active_color: "{{ prior_active_color }}"
+          delegate_to: "{{ veza_container_prefix + 'haproxy' }}"
+
+        - name: Fail the playbook
+          ansible.builtin.fail:
+            msg: >-
+              Public health probe via HAProxy failed after deploy of SHA
+              {{ veza_release_sha[:12] }} to color {{ inactive_color }}.
+              HAProxy reverted to the prior color ({{ prior_active_color }}).
+              The freshly-deployed {{ inactive_color }} containers are kept
+              alive for forensics — inspect with:
+              incus exec {{ veza_container_prefix }}backend-{{ inactive_color }} -- journalctl -u veza-backend -n 200
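+
+# Contract assumed from roles/veza_haproxy_switch (landed in an earlier
+# commit; sketched here, not authoritative): it consumes veza_active_color,
+# renders + validates the new haproxy.cfg, swaps it in atomically, HUPs
+# HAProxy, and its rescue path restores the previous cfg. Phase B also
+# assumes the role records the new color in /var/lib/veza/active-color —
+# the read side lives here, the write side is expected in the role.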