Compare commits: 172729bdff ... 594204fb86

4 commits: 594204fb86, 6de2923821, 22d09dcbbb, f4eb4732dd

15 changed files with 1142 additions and 19 deletions
config/prometheus/alert_rules.yml
@@ -120,3 +120,142 @@ groups:

```yaml
            unreachable means we are at-or-past the redundancy ceiling.
            Any further failure causes data unavailability. Page now.
          runbook_url: "https://veza.fr/runbooks/minio-nodes-unreachable"

  # W5+ : Forgejo+Ansible+Incus deploy pipeline. The deploy_app.yml
  # playbook writes a textfile-collector .prom file under
  # /var/lib/node_exporter/textfile_collector/veza_deploy.prom on every
  # deploy attempt. node_exporter scrapes it and exposes the metrics
  # via the standard /metrics endpoint, no Pushgateway needed.
  - name: veza_deploy
    rules:
      - alert: VezaDeployFailed
        # last_failure_timestamp newer than last_success_timestamp.
        # 5m soak so a deploy in progress (writes failure THEN switches
        # back, which writes success on the next successful deploy)
        # doesn't transient-trigger.
        expr: |
          max(veza_deploy_last_failure_timestamp) by (env) >
          max(veza_deploy_last_success_timestamp or vector(0)) by (env)
        for: 5m
        labels:
          severity: critical
          page: "true"
        annotations:
          summary: "Veza deploy to {{ $labels.env }} failed"
          description: |
            The most recent deploy attempt to {{ $labels.env }} failed
            and HAProxy was reverted to the prior color. The failed
            color's containers are kept alive for forensics. Inspect:
              gh workflow run cleanup-failed.yml -f env={{ $labels.env }} -f color=<failed_color>
            once the operator has read the journalctl output.
          runbook_url: "https://veza.fr/runbooks/deploy-failed"

      - alert: VezaStaleDeploy
        # Staging cadence is daily-ish; a 7-day silence smells like
        # CI is broken or the team is on holiday with prod still
        # serving an old SHA. Prod is monthly-ish so 30 days.
        # Two separate alerts because the threshold differs.
        expr: |
          (time() - max(veza_deploy_last_success_timestamp{env="staging"}) by (env)) > (7 * 86400)
        for: 1h
        labels:
          severity: warning
          page: "false"
        annotations:
          summary: "Staging deploy hasn't succeeded in 7+ days"
          description: |
            Last successful staging deploy was
            {{ $value | humanizeDuration }} ago. Pipeline likely broken
            (Forgejo runner offline? secret expired?).

      - alert: VezaStaleDeployProd
        expr: |
          (time() - max(veza_deploy_last_success_timestamp{env="prod"}) by (env)) > (30 * 86400)
        for: 1h
        labels:
          severity: warning
          page: "false"
        annotations:
          summary: "Prod deploy hasn't succeeded in 30+ days"
          description: |
            Last successful prod deploy was {{ $value | humanizeDuration }}
            ago. Tag-based release cadence likely stalled.

      - alert: VezaFailedColorAlive
        # The textfile collector also exposes a custom metric
        # `veza_deploy_failed_color_alive{env=...,color=...}` set by
        # a small periodic script that scans `incus list` for
        # containers in the failed-deploy state. (Stub script lives
        # under scripts/observability/scan-failed-colors.sh.)
        # Threshold 24h so the operator has at least a working day
        # to do post-mortem before the alert fires.
        expr: max(veza_deploy_failed_color_alive) by (env, color) > 0
        for: 24h
        labels:
          severity: warning
          page: "false"
        annotations:
          summary: "Failed deploy color {{ $labels.color }} still alive in {{ $labels.env }}"
          description: |
            A previously-failed-deploy color has been kept alive for
            24+ hours. Either complete forensics + run cleanup-failed,
            or the next deploy will recycle it automatically.

  # v1.0.9 W5 Day 24 : synthetic monitoring (blackbox exporter).
  # Each parcours is probed every 5 min ; the 10m `for:` window means
  # an alert fires after 2 consecutive failures (per the roadmap
  # acceptance gate). `parcours` label carries the human-readable
  # name from blackbox_targets.yml so dashboards group cleanly.
  - name: veza_synthetic
    rules:
      - alert: SyntheticParcoursDown
        # probe_success is 0 when blackbox couldn't complete the probe.
        # The metric is emitted per (instance, parcours) so the alert
        # fires per-parcours, letting the on-call see exactly which
        # journey is broken without grepping logs.
        expr: probe_success{probe_kind="synthetic"} == 0
        for: 10m
        labels:
          severity: warning
          page: "false"
        annotations:
          summary: "Synthetic parcours {{ $labels.parcours }} failing for 10m"
          description: |
            Blackbox exporter has been unable to complete the
            {{ $labels.parcours }} parcours against {{ $labels.instance }}
            for 10 minutes (≥ 2 consecutive failures). End-user impact
            is likely real — investigate the underlying component
            BEFORE the related per-component alert fires.
          runbook_url: "https://veza.fr/runbooks/synthetic-parcours-down"

      - alert: SyntheticAuthLoginDown
        # Login is the gate for everything else ; a single 10m blip
        # is critical. Pages.
        expr: probe_success{parcours="auth_login"} == 0
        for: 10m
        labels:
          severity: critical
          page: "true"
        annotations:
          summary: "Synthetic auth_login down — login surface is broken"
          description: |
            The auth_login synthetic parcours has failed for 10+ minutes.
            Real users cannot log in. Page now.
          runbook_url: "https://veza.fr/runbooks/synthetic-parcours-down"

      - alert: SyntheticProbeSlow
        # Probe latency budget : 5s for HTTP, 8s for the heavier ones.
        # When real-user latency degrades, blackbox is the canary.
        expr: probe_duration_seconds{probe_kind="synthetic"} > 8
        for: 15m
        labels:
          severity: warning
          page: "false"
        annotations:
          summary: "Synthetic parcours {{ $labels.parcours }} > 8s for 15m"
          description: |
            Probe duration exceeded 8 seconds for the past 15 minutes.
            Real users are likely seeing visible latency. Cross-check
            the SLO burn-rate alerts ; if those are quiet but this
            fires, the issue is in the synthetic-only path (DNS,
            external dependency).
```
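For orientation, the textfile that deploy_app.yml writes could look like the sketch below; the metric names are the ones the rules above consume, while the env value and Unix timestamps are illustrative:

```
# /var/lib/node_exporter/textfile_collector/veza_deploy.prom (illustrative)
# TYPE veza_deploy_last_success_timestamp gauge
veza_deploy_last_success_timestamp{env="staging"} 1718013600
# TYPE veza_deploy_last_failure_timestamp gauge
veza_deploy_last_failure_timestamp{env="staging"} 1717995600
```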
config/prometheus/blackbox_targets.yml (new file, 89 lines)
@@ -0,0 +1,89 @@

```yaml
# Prometheus blackbox scrape config — synthetic monitoring of the
# 6 parcours from v1.0.9 W5 Day 24.
#
# Probed every 5 minutes ; alerts fire after 2 consecutive failures.
# This file is sourced by the main prometheus.yml :
#
#   scrape_configs:
#     - job_name: 'blackbox'
#       file_sd_configs:
#         - files:
#             - /etc/prometheus/blackbox_targets.yml
#       metrics_path: /probe
#       relabel_configs:
#         - source_labels: [__address__]
#           target_label: __param_target
#         - source_labels: [__param_target]
#           target_label: instance
#         - source_labels: [module]
#           target_label: __param_module
#         - target_label: __address__
#           replacement: blackbox-exporter.lxd:9115
#
# Each entry below carries a `module` label that maps to a
# blackbox.yml module name AND a `parcours` label so Grafana can
# group / filter. Prometheus passes module + target through the
# query string when it scrapes blackbox.

# Parcours 1 — register / verify / login
# (Reachability of the auth surface ; multi-step register-then-verify
# requires a synthetic-client binary, tracked as follow-up.)
- targets:
    - https://staging.veza.fr/api/v1/auth/login
  labels:
    module: http_status_envelope
    parcours: auth_login
    probe_kind: synthetic

# Parcours 2 — login → search → play first
- targets:
    - https://staging.veza.fr/api/v1/search?q=test
  labels:
    module: http_search
    parcours: search
    probe_kind: synthetic

# Parcours 3 — login → upload tiny audio → poll status
# Approximated by reaching the upload-config endpoint ; the actual
# upload requires auth + file body which blackbox can't model.
- targets:
    - https://staging.veza.fr/api/v1/upload/config
  labels:
    module: http_2xx
    parcours: upload_init
    probe_kind: synthetic

# Parcours 4 — login → browse marketplace → add to cart
# Approximated by reaching the marketplace listing endpoint.
- targets:
    - https://staging.veza.fr/api/v1/marketplace/products?limit=5
  labels:
    module: http_marketplace
    parcours: marketplace_list
    probe_kind: synthetic

# Parcours 5 — WebSocket chat connect + send message
# TCP-only probe : confirms the listener is up. The full handshake +
# auth + send round-trip needs the synthetic-client binary.
- targets:
    - staging.veza.fr:443
  labels:
    module: tcp_websocket
    parcours: chat_websocket
    probe_kind: synthetic

# Parcours 6 — live stream metadata fetch
- targets:
    - https://staging.veza.fr/api/v1/streams/active
  labels:
    module: http_2xx
    parcours: live_streams
    probe_kind: synthetic

# Bonus — public status page health (covers the /api/v1/status
# response shape so a Cachet/statuspage.io consumer doesn't depend
# on a hand-pinged check).
- targets:
    - https://staging.veza.fr/api/v1/status
  labels:
    module: http_status_envelope
    parcours: status_endpoint
    probe_kind: synthetic
```
MIGRATIONS.md
@@ -47,3 +47,114 @@ Output: `veza-backend-api/migrations/baseline_v0601.sql`

3. Write idempotent SQL when possible (e.g. `IF NOT EXISTS`)
4. Test locally before committing
5. Run `squash_migrations.sh` to update the baseline for the release

## Expand-contract discipline (W5+ deploy pipeline contract)

> **TL;DR** — every migration must be **backward-compatible** with the
> previous deploy's binary. No `DROP COLUMN`, no `ALTER ... NOT NULL`,
> no `RENAME` in step 1. Schema evolution happens across **multiple
> deploys**, not in one.

### Why this matters

The blue/green deploy pipeline (`infra/ansible/playbooks/deploy_app.yml`)
makes rollback trivial at the **app layer**: HAProxy flips back to
the previous color, ~5 seconds wall-clock, no data lost. But the
**database** doesn't have colors. Migrations apply once, against the
shared postgres container, and stay applied across the rollback.

If a deploy adds a non-nullable column and the rollback tries to insert
a row without that column, the insert fails. The rollback button is
broken — the previous binary now crashes against the post-migration
schema.

The fix isn't to make the pipeline smarter. It's to make migrations
forward-AND-backward compatible by construction.

### The expand-contract pattern (3 deploys per "destructive" change)

**Step 1 (deploy N) — Expand**: add the new shape **alongside** the
old. Both binaries (old + new) work.

```sql
-- migration NNN_add_user_email_verified.sql
ALTER TABLE users ADD COLUMN email_verified BOOLEAN;
-- nullable, no default — the old binary doesn't know about it.
-- the new binary writes true/false on signup ; reads coalesce NULL → false.
```
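The read-side coalescing that comment mentions would be, in SQL terms, something like the sketch below (the real read path lives in the backend's Go queries):

```sql
-- expand-window read: treat NULL as false until the backfill lands
SELECT id, COALESCE(email_verified, false) AS email_verified
FROM users
WHERE id = $1;
```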
**Step 2 (deploy N+1) — Backfill**: once Step 1 is stable in prod
(≥ 1 week, no rollbacks needed), backfill existing rows.

```sql
-- migration NNN+1_backfill_user_email_verified.sql
UPDATE users SET email_verified = false WHERE email_verified IS NULL;
```

**Step 3 (deploy N+2) — Contract**: once the backfill is in, add the
constraint. The old binary (still write-coalescing NULL → false) keeps
working; the new binary uses the `NOT NULL` knowledge.

```sql
-- migration NNN+2_user_email_verified_not_null.sql
ALTER TABLE users ALTER COLUMN email_verified SET NOT NULL;
ALTER TABLE users ALTER COLUMN email_verified SET DEFAULT false;
```

After Step 3 is stable, you can roll back exactly **one** deploy without
breakage. Rolling back beyond Step 1 is no longer safe — that's the
expected consequence of expand-contract.

### Allowed in a single deploy

| Change | Safe in one deploy? |
| --- | --- |
| `CREATE TABLE` | yes |
| `CREATE INDEX CONCURRENTLY` | yes |
| Add nullable column | yes |
| Add column with constant default | yes (PG ≥ 11) |
| Backfill UPDATE (idempotent) | yes |
| `DROP INDEX CONCURRENTLY` | yes (read paths flex) |
| `DROP TABLE` (if no recent code reads it) | with caution |

### NOT allowed in a single deploy

| Change | Why |
| --- | --- |
| `DROP COLUMN` | rollback's binary still selects it |
| `ALTER COLUMN ... NOT NULL` (no prior backfill) | rollback inserts NULL |
| `ALTER COLUMN ... TYPE` | rollback's binary expects the old type |
| `RENAME COLUMN` | rollback's binary still references the old name |
| `RENAME TABLE` | rollback queries the old name |

### Reviewer checklist (PRs touching `veza-backend-api/migrations/`)

- [ ] Migration is **forward-only** (GORM doesn't run rollback SQL).
- [ ] Migration is **idempotent** (re-running on an already-migrated
      DB is a no-op — `IF NOT EXISTS`, `ON CONFLICT DO NOTHING`, etc.).
- [ ] No `DROP COLUMN`, `ALTER ... NOT NULL`, `RENAME` (or, if there
      is, the PR description references the prior backfill PRs and
      explains why this is the contract step).
- [ ] If the migration takes a heavy lock (e.g. an `ALTER TABLE` that
      rewrites the table), use `CREATE INDEX CONCURRENTLY` or split
      the migration.
- [ ] App code changes assume both old and new schema are valid.

### When you must violate the rule (incident)

Sometimes a hot incident demands a destructive change ASAP and rollback
is an acceptable risk. In that case:

1. Tag the PR with `migration:destructive`.
2. Document in the PR body what the rollback procedure is (manual
   SQL to recreate the dropped column, etc.).
3. Get a second pair of eyes on the migration before merge.
4. Block the corresponding rollback workflow for that env until
   you've verified the new schema is sticking.

### Future hardening (not in v1.0.x)

A `squawk` linter step in `.forgejo/workflows/ci.yml` could scan
`veza-backend-api/migrations/*.sql` and fail on `DROP COLUMN`,
`ALTER ... NOT NULL`, `RENAME`. The discipline above is the v1.0
answer; tooling lands when the hand-rolled discipline starts
missing things.
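A sketch of what that step could look like (hypothetical ci.yml fragment: the job name, runner label, and the `squawk-cli` npm install are assumptions, not existing workflow config):

```yaml
# hypothetical .forgejo/workflows/ci.yml fragment
lint-migrations:
  runs-on: docker
  steps:
    - uses: actions/checkout@v4
    - name: Lint migrations with squawk
      run: |
        npm install -g squawk-cli
        squawk veza-backend-api/migrations/*.sql
```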
docs/RUNBOOK_ROLLBACK.md (new file, 253 lines)
@@ -0,0 +1,253 @@
# Runbook — rollback a Veza deploy

Three rollback paths, ordered from fastest to slowest. Pick based on
what's still alive and what you're rolling back from.

| Path | Time | Use when |
| --- | --- | --- |
| 1. HAProxy fast-flip | ~5s | The previous color's containers are still alive. |
| 2. Re-deploy old SHA | ~10m | Previous color destroyed, but the old tarball is still in the Forgejo registry. |
| 3. Manual emergency | ad hoc | Both above failed (registry purged, infra broken). |

> **Before you roll back, breathe and read this first.** The default
> instinct under fire is "smash the rollback button". Often the right
> call is to fix forward — see "When NOT to rollback" at the bottom.

---

## Decision flowchart

```
Did the new color come up at all?
                │
    ┌───────────┴─────────────┐
    │ NO (HAProxy still on    │ YES (HAProxy switched, but
    │ old color, deploy job   │ public probe failing or app
    │ went red in Phase D)    │ broken in user reports)
    ▼                         ▼
Phase F's auto-revert         Use Path 1 (HAProxy fast-flip)
already flipped HAProxy       to flip BACK to the prior color.
for you. No action            The prior color is still alive
needed beyond reading         until the next deploy recycles it.
logs.
                              If the prior color was already
                              cleaned up, use Path 2.
```

---

## Path 1 — HAProxy fast-flip (~5s)

Use when the prior color's containers are still alive. Triggered via
the `Veza rollback` workflow with `mode=fast`.

### Pre-checks

```bash
# What's the current active color?
incus exec veza-staging-haproxy -- cat /var/lib/veza/active-color
# (or veza-haproxy in prod)

# What's the prior color (last entry of the history)?
incus exec veza-staging-haproxy -- head -2 /var/lib/veza/active-color.history

# Are the prior color's containers RUNNING? (incus list takes a regex,
# not shell brace expansion)
incus list 'veza-staging-(backend|stream|web)-blue' --format csv -c n,s
```

### Trigger

In the Forgejo UI: **Actions → Veza rollback → Run workflow**:

| input | value |
| --- | --- |
| env | staging (or prod) |
| mode | fast |
| target_color | the PRIOR color, e.g. blue if green is currently active |
| release_sha | (leave empty) |

The workflow runs `infra/ansible/playbooks/rollback.yml -e mode=fast
-e target_color=blue`, which:

1. Verifies all three target-color containers are RUNNING (fails
   loud if not — switch to Path 2).
2. Re-templates `haproxy.cfg` with `veza_active_color=blue`,
   validates with `haproxy -c`, atomic-mv-swaps, HUPs (sketched
   below).
3. Updates `/var/lib/veza/active-color`.
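A rough shell equivalent of that validate-then-swap sequence (the paths are assumptions; the actual tasks live in the `veza_haproxy_switch` role):

```bash
# hypothetical equivalent of the role's swap step
haproxy -c -f /etc/haproxy/haproxy.cfg.new    # validate the candidate config first
mv -f /etc/haproxy/haproxy.cfg.new /etc/haproxy/haproxy.cfg    # atomic swap
systemctl reload haproxy                      # graceful reload, no dropped connections
```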
Wall time: ~5s. Zero connection drop (HAProxy reload is graceful).

### Post-rollback

- Verify externally: `curl https://staging.veza.fr/api/v1/health`
- Check logs of the bad color (kept alive for forensics): `incus exec
  veza-staging-backend-green -- journalctl -u veza-backend -n 200`
- Once the root cause is understood, run the **Veza cleanup** workflow
  with `color=green` to reclaim the slot.

---

## Path 2 — Re-deploy older SHA (~10 minutes)

Use when the prior color's containers were already destroyed (the next
deploy recycled them) but the old tarball is still in the Forgejo
package registry.

### Pre-checks

```bash
# Pick the SHA you want to roll back TO.
# Look at the active-color.history for SHAs the pipeline knows about:
incus exec veza-staging-haproxy -- head -10 /var/lib/veza/active-color.history

# Or `git log --oneline main` for any commit; just confirm the
# tarball still exists in the registry (default retention 30 SHAs
# per component):
curl -fsSL -I -H "Authorization: token $TOKEN" \
  "https://forgejo.veza.fr/api/packages/talas/generic/veza-backend/$SHA/veza-backend-$SHA.tar.zst"
```

### Trigger

In the Forgejo UI: **Actions → Veza rollback → Run workflow**:

| input | value |
| --- | --- |
| env | staging (or prod) |
| mode | full |
| target_color | (leave empty) |
| release_sha | the 40-char SHA you're rolling TO |

The workflow runs `playbooks/rollback.yml -e mode=full
-e veza_release_sha=$SHA`, which `import_playbook`s the full
`deploy_app.yml` pipeline. Same Phase A → Phase F sequence as a
normal deploy, but with the older SHA.

Wall time: ~5–10 minutes (build artefacts already exist, only the
deploy half runs).

### Caveat — schema migrations

Migrations are **not** rolled back automatically. The schema after
Path 2 is the post-deploy schema, not the pre-deploy schema.
Per **MIGRATIONS.md**'s expand-contract discipline, this should be
fine for one deploy back. If it isn't (i.e., the failed deploy
included a destructive migration), see **Path 3**.

---

## Path 3 — Manual emergency (ad hoc)

You're here when:

- The Forgejo registry has been purged of the SHA you need.
- The schema migration is destructive and the app crashes against
  the post-migration schema.
- The Incus host itself is in a bad state.

### Tarball missing — rebuild and push manually

```bash
# Build the artefact locally (you'll need the toolchain):
cd veza-backend-api
GOOS=linux GOARCH=amd64 CGO_ENABLED=0 go build -trimpath -ldflags "-s -w" \
  -o ./bin/veza-api ./cmd/api/main.go
tar --use-compress-program=zstd -cf "/tmp/veza-backend-$SHA.tar.zst" \
  -C ./bin veza-api migrate_tool

# Push to the registry:
curl -sSL --fail-with-body -X PUT \
  -H "Authorization: token $TOKEN" \
  --upload-file "/tmp/veza-backend-$SHA.tar.zst" \
  "https://forgejo.veza.fr/api/packages/talas/generic/veza-backend/$SHA/veza-backend-$SHA.tar.zst"

# Then run Path 2.
```

### Schema is poisoned — manual SQL

The destructive migration's PR description should document the
inverse SQL (per MIGRATIONS.md "When you must violate the rule").
Apply it inside the postgres container:

```bash
incus exec veza-staging-postgres -- psql -U veza veza < /tmp/inverse.sql
```

Then run Path 2 to deploy the older binary.

### Incus host broken — rollback ZFS snapshot

`deploy_data.yml` snapshots every data container's dataset before
mutating anything (`<dataset>@pre-deploy-<sha>`). To restore:

```bash
# First, stop the container:
incus stop veza-staging-postgres

# Roll the dataset back to the pre-deploy snapshot:
zfs rollback -r rpool/incus/containers/veza-staging-postgres@pre-deploy-<sha>

# Restart the container:
incus start veza-staging-postgres
```

This loses any data written after the snapshot. Last resort only.

---

## When NOT to rollback

- **A single user reports a bug.** Triage first; rolling back affects
  100% of users to fix something hitting <1%.
- **Performance regression.** If the new SHA is up but slow, scale
  horizontally before rolling back. (The future Hetzner offload covers
  this; for now, accept the regression and prep a fix-forward.)
- **Cosmetic UI bug.** Hot-fix the frontend and let the deploy
  pipeline ship it as a normal commit.
- **You're not on-call and didn't get a page.** Don't roll back "to
  be safe". That's the on-call's call.

The rollback button's existence isn't a license to use it
preemptively. Each rollback resets the team's confidence in the
pipeline; rolling back too often makes the next real deploy feel risky.

---

## Post-incident

After ANY rollback (Path 1, 2, or 3):

1. Update **docs/POSTMORTEMS.md** (or `docs/runbooks/incidents/<date>.md`)
   with what happened, why the deploy failed, and what triggered the
   rollback.
2. File the fix as a normal PR; do NOT skip CI.
3. If the failed deploy left containers behind (Path 1's "old color
   kept alive"), run the **Veza cleanup** workflow with the failed
   color once forensics are done.
4. Verify the alert `VezaDeployFailed` cleared (the next successful
   deploy resets `last_success_timestamp > last_failure_timestamp`).
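One way to confirm the clear from the Prometheus console (a query assembled from the metric names the rules use; it returns the env series whose last success is newer than its last failure):

```promql
max(veza_deploy_last_success_timestamp) by (env)
  > max(veza_deploy_last_failure_timestamp) by (env)
```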
---

## Workflows referenced

- `.forgejo/workflows/deploy.yml` — push:main → staging, tag → prod.
- `.forgejo/workflows/rollback.yml` — workflow_dispatch only, modes
  fast and full.
- `.forgejo/workflows/cleanup-failed.yml` — workflow_dispatch only,
  destroys a specific color's app containers.

## Playbooks referenced

- `infra/ansible/playbooks/deploy_app.yml`
- `infra/ansible/playbooks/rollback.yml`
- `infra/ansible/playbooks/cleanup_failed.yml`
- `infra/ansible/playbooks/deploy_data.yml`

## Roles referenced

- `infra/ansible/roles/veza_app/`
- `infra/ansible/roles/veza_haproxy_switch/`
- `infra/ansible/roles/haproxy/` (template `haproxy.cfg.j2` with
  blue/green topology toggle).
infra/ansible/inventory/lab.yml
@@ -112,6 +112,14 @@ all:

```diff
     vars:
       ansible_connection: community.general.incus
       ansible_python_interpreter: /usr/bin/python3
+    # v1.0.9 W5 Day 24 — synthetic monitoring runner. Should sit on a
+    # host external to the prod cluster ; lab phase-1 colocates it.
+    blackbox_exporter:
+      hosts:
+        blackbox-exporter:
+      vars:
+        ansible_connection: community.general.incus
+        ansible_python_interpreter: /usr/bin/python3
     # v1.0.9 W3 Day 12: distributed MinIO with EC:2. 4 Incus containers,
     # each providing one drive ; single erasure set tolerates 2 simultaneous
     # node failures.
```
infra/ansible/inventory/prod.yml
@@ -1,21 +1,60 @@

```diff
-# Prod inventory — single R720 (self-hosted Incus) at launch, with
-# Hetzner debordement planned post-launch. ROADMAP_V1.0_LAUNCH.md §2
-# documents the COMPRESSED HA stance: real multi-host HA arrives
-# v1.1+; v1.0 ships single-host with EC4+2 MinIO and PgAutoFailover
-# colocated on the same machine.
+# Prod inventory — single R720 (self-hosted Incus) at v1.0 launch,
+# Hetzner debordement post-launch. ROADMAP_V1.0_LAUNCH.md §2 documents
+# the COMPRESSED HA stance : real multi-host HA arrives v1.1+ ; v1.0
+# ships single-host with EC4+2 MinIO + PgAutoFailover colocated.
 #
-# Real ansible_host left as TODO until DNS (EX-5) is live. Use
-# ssh-config aliases or fill these in once `api.veza.fr` resolves.
+# Topology mirrors staging.yml (same shape, different prefix +
+# different network — see group_vars/prod.yml). Phase-2 (post v1.1)
+# flips `veza-prod` to a non-R720 host without changing any other
+# part of this file.
+#
+# Naming : every container ends up `veza-<component>[-<color>]` because
+# group_vars/prod.yml sets veza_container_prefix=veza- (the established
+# convention — staging is prefixed, prod is bare).
 all:
   hosts:
     veza-prod:
-      ansible_host: TODO_PROD_IP
+      ansible_host: 10.0.20.150
       ansible_user: ansible
       ansible_python_interpreter: /usr/bin/python3
   children:
     incus_hosts:
       hosts:
         veza-prod:
-    veza_prod:
+    haproxy:
       hosts:
-        veza-prod:
+        veza-haproxy:
+      vars:
+        ansible_connection: community.general.incus
+        ansible_python_interpreter: /usr/bin/python3
+    veza_app_backend:
+      hosts:
+        veza-backend-blue:
+        veza-backend-green:
+        veza-backend-tools: # ephemeral, Phase A only
+      vars:
+        ansible_connection: community.general.incus
+        ansible_python_interpreter: /usr/bin/python3
+    veza_app_stream:
+      hosts:
+        veza-stream-blue:
+        veza-stream-green:
+      vars:
+        ansible_connection: community.general.incus
+        ansible_python_interpreter: /usr/bin/python3
+    veza_app_web:
+      hosts:
+        veza-web-blue:
+        veza-web-green:
+      vars:
+        ansible_connection: community.general.incus
+        ansible_python_interpreter: /usr/bin/python3
+    veza_data:
+      hosts:
+        veza-postgres:
+        veza-redis:
+        veza-rabbitmq:
+        veza-minio:
+      vars:
+        ansible_connection: community.general.incus
+        ansible_python_interpreter: /usr/bin/python3
```
infra/ansible/inventory/staging.yml
@@ -1,20 +1,82 @@

```diff
-# Staging inventory — Hetzner Cloud host that mirrors prod topology
-# (Postgres + Redis + RabbitMQ + MinIO + backend/web/stream
-# containers) at a smaller scale, for pre-deploy validation.
+# Staging inventory — local R720 (same Incus daemon as the Forgejo
+# runner ; phase-1 simplification documented in group_vars/staging.yml).
 #
-# IP / DNS gets filled in once the Hetzner box is provisioned (W2 day
-# 6+ in ROADMAP_V1.0_LAUNCH.md). Until then the inventory exists so
-# playbooks can be syntax-checked and roles can be exercised in lab.
+# Connection model :
+#   * `veza-staging` is the Incus host (R720 itself). Ansible
+#     reaches it over SSH ; the runner has the right SSH key in
+#     ~/.ssh/.
+#   * Every other host in this inventory lives INSIDE that Incus
+#     host as an LXC container. Ansible reaches them via the
+#     `community.general.incus` connection plugin (no SSH-into-
+#     containers needed) — see group vars under each child group.
+#
+# Container set :
+#   * App tier  — backend/stream/web in blue/green pairs (6
+#                 containers) + an ephemeral backend-tools used
+#                 by deploy_app.yml Phase A (migrations).
+#   * Edge      — haproxy (singleton, persistent across deploys).
+#   * Data tier — postgres, redis, rabbitmq, minio (singletons,
+#                 state survives every deploy).
+#
+# Used by :
+#   * .forgejo/workflows/deploy.yml (push:main → -i inventory/staging.yml)
+#   * .forgejo/workflows/rollback.yml + cleanup-failed.yml
+#   * Local debug : `ansible-playbook -i inventory/staging.yml \
+#       playbooks/deploy_data.yml --check --diff \
+#       --vault-password-file ~/.vault-pass`
+#
+# Naming : every container ends up `veza-staging-<component>[-<color>]`
+# because group_vars/staging.yml sets veza_container_prefix=veza-staging-.
 all:
   hosts:
     veza-staging:
-      ansible_host: TODO_HETZNER_IP
+      ansible_host: 10.0.20.150
       ansible_user: ansible
       ansible_python_interpreter: /usr/bin/python3
   children:
     incus_hosts:
       hosts:
         veza-staging:
-    veza_staging:
+    haproxy:
       hosts:
-        veza-staging:
+        veza-staging-haproxy:
+      vars:
+        ansible_connection: community.general.incus
+        ansible_python_interpreter: /usr/bin/python3
+    # The 6 app containers + 1 ephemeral tools container. deploy_app.yml
+    # selects the inactive color dynamically from the haproxy
+    # container's /var/lib/veza/active-color file ; both blue and
+    # green sit in inventory so either color is reachable when needed.
+    veza_app_backend:
+      hosts:
+        veza-staging-backend-blue:
+        veza-staging-backend-green:
+        veza-staging-backend-tools: # ephemeral, Phase A only
+      vars:
+        ansible_connection: community.general.incus
+        ansible_python_interpreter: /usr/bin/python3
+    veza_app_stream:
+      hosts:
+        veza-staging-stream-blue:
+        veza-staging-stream-green:
+      vars:
+        ansible_connection: community.general.incus
+        ansible_python_interpreter: /usr/bin/python3
+    veza_app_web:
+      hosts:
+        veza-staging-web-blue:
+        veza-staging-web-green:
+      vars:
+        ansible_connection: community.general.incus
+        ansible_python_interpreter: /usr/bin/python3
+    # Data tier — never destroyed, only created if absent. ZFS
+    # snapshots taken on every deploy as the safety net.
+    veza_data:
+      hosts:
+        veza-staging-postgres:
+        veza-staging-redis:
+        veza-staging-rabbitmq:
+        veza-staging-minio:
+      vars:
+        ansible_connection: community.general.incus
+        ansible_python_interpreter: /usr/bin/python3
```
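To sanity-check the incus connection plugin against this inventory before a real run (an ad-hoc command; it assumes the `community.general` collection is installed on the runner):

```bash
ansible -i inventory/staging.yml veza_data -m ansible.builtin.ping
```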
infra/ansible/playbooks/blackbox_exporter.yml (new file, 56 lines)
@@ -0,0 +1,56 @@
```yaml
# Synthetic monitoring playbook — provisions the blackbox-exporter
# Incus container and lays down the role.
#
# v1.0.9 W5 Day 24.
#
# IMPORTANT : the blackbox exporter SHOULD run on a host that is
# externally-routed (separate from the prod cluster) so a probe
# failure reflects what an external user sees. v1.0 lab keeps it on
# the same Incus host for simplicity ; phase-2 moves it off-box.
#
# Run with:
#   ansible-galaxy collection install community.general
#   ansible-playbook -i inventory/lab.yml playbooks/blackbox_exporter.yml
---
- name: Provision Incus container for blackbox exporter
  hosts: incus_hosts
  become: true
  gather_facts: true
  tasks:
    - name: Launch blackbox-exporter container
      ansible.builtin.shell:
        cmd: |
          set -e
          if ! incus info blackbox-exporter >/dev/null 2>&1; then
            incus launch images:ubuntu/22.04 blackbox-exporter
            echo "LAUNCHED"  # marker consumed by changed_when below
            for _ in $(seq 1 30); do
              if incus exec blackbox-exporter -- cloud-init status 2>/dev/null | grep -q "status: done"; then
                break
              fi
              sleep 1
            done
            incus exec blackbox-exporter -- apt-get update
            incus exec blackbox-exporter -- apt-get install -y python3 python3-apt
          fi
      args:
        executable: /bin/bash
      register: provision_result
      changed_when: "'LAUNCHED' in provision_result.stdout"
      tags: [blackbox, provision]

    - name: Refresh inventory so the new container is targetable
      ansible.builtin.meta: refresh_inventory

- name: Apply common baseline
  hosts: blackbox_exporter
  become: true
  gather_facts: true
  roles:
    - common

- name: Install + configure blackbox exporter
  hosts: blackbox_exporter
  become: true
  gather_facts: true
  roles:
    - blackbox_exporter
```
infra/ansible/roles/blackbox_exporter/README.md (new file, 93 lines)
@@ -0,0 +1,93 @@
# `blackbox_exporter` role — synthetic monitoring runner

Single Incus container running Prometheus' `blackbox_exporter`. Probed by Prometheus every 5 minutes against the 6 user parcours from v1.0.9 W5 Day 24. Alerts fire after 2 consecutive failures (`for: 10m` × 5-min scrape = 2 cycles).

## Topology

```
          Prometheus :9090
                │  scrape every 5m
                ▼
  ┌─────────────────────────────┐
  │ blackbox-exporter.lxd:9115  │
  │         (this role)         │
  └──────────────┬──────────────┘
                 │  probes (HTTP / TCP)
      ┌──────────┼─────────────────────────┐
      ▼          ▼                         ▼
  staging.veza.fr/api/v1/auth/login   /api/v1/search?q=test
  /api/v1/marketplace/products        ...
```

The exporter SHOULD run on a host **external** to the prod cluster so probe failures reflect what an external user sees, not what an already-broken internal service hides. v1.0 lab phase-1 colocates it for simplicity; phase-2 moves the container off-box.

## Probe modules (defined in `templates/blackbox.yml.j2`)

| Module | Used by parcours | What it asserts |
| --- | --- | --- |
| `http_2xx` | upload_init, live_streams | Status code 200 or 204, TLS valid |
| `http_status_envelope` | auth_login, status_endpoint | Body matches `"success":\s*true` |
| `http_search` | search | Body matches `"tracks"` (seed data must include hits) |
| `http_marketplace` | marketplace_list | 200 (no body assertion; an empty array is valid) |
| `tcp_websocket` | chat_websocket | TLS-wrapped TCP handshake completes |

Multi-step parcours that need session state (Register → Verify → Login, Login → Search → Play first result) are **out of scope** for blackbox. Tracked as a follow-up: a small Go binary that runs as a CronJob, walks the steps, and writes textfile-collector metrics to `/var/lib/node_exporter/textfile_collector/veza_synthetic.prom`.
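Until that binary exists, the shape of such a walker, sketched in shell (everything here is hypothetical: the login payload, the `.token` response field, and the metric names are assumptions, not shipped code):

```bash
#!/usr/bin/env bash
# hypothetical login→search walker emitting textfile-collector metrics
set -euo pipefail
OUT=/var/lib/node_exporter/textfile_collector/veza_synthetic.prom
START=$(date +%s)
OK=1
# step 1 — login (response field name is an assumption)
TOKEN=$(curl -fsS -X POST https://staging.veza.fr/api/v1/auth/login \
  -H 'Content-Type: application/json' \
  -d '{"email":"synthetic@veza.fr","password":"<from vault>"}' | jq -r .token) || OK=0
# step 2 — authenticated search
curl -fsS -H "Authorization: Bearer ${TOKEN:-}" \
  'https://staging.veza.fr/api/v1/search?q=test' >/dev/null || OK=0
{
  echo "# TYPE veza_synthetic_parcours_success gauge"
  echo "veza_synthetic_parcours_success{parcours=\"login_search\"} $OK"
  echo "veza_synthetic_parcours_duration_seconds{parcours=\"login_search\"} $(( $(date +%s) - START ))"
} > "$OUT.tmp" && mv -f "$OUT.tmp" "$OUT"
```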
## Defaults

| variable | default | meaning |
| --- | --- | --- |
| `blackbox_version` | `0.25.0` | Prometheus blackbox_exporter release |
| `blackbox_listen_port` | `9115` | Prometheus default |
| `blackbox_target_base_url` | `https://staging.veza.fr` | base URL the probes hit |

## Prometheus scrape config

`config/prometheus/blackbox_targets.yml` carries the 7 file-SD entries (6 parcours + the status-endpoint bonus). Wire it in `prometheus.yml`:

```yaml
scrape_configs:
  - job_name: blackbox
    file_sd_configs:
      - files: [/etc/prometheus/blackbox_targets.yml]
    metrics_path: /probe
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - source_labels: [module]
        target_label: __param_module
      - target_label: __address__
        replacement: blackbox-exporter.lxd:9115
```

## Alert rules

`config/prometheus/alert_rules.yml`, group `veza_synthetic`:

- `SyntheticParcoursDown` — any parcours fails for 10m → warning.
- `SyntheticAuthLoginDown` — auth_login fails for 10m → critical (page).
- `SyntheticProbeSlow` — probe duration > 8s for 15m → warning.

## Operations

```bash
# Service status:
sudo systemctl status blackbox_exporter

# One-off probe (dev / debug):
curl 'http://blackbox-exporter.lxd:9115/probe?target=https://staging.veza.fr/api/v1/health&module=http_status_envelope'

# Live probe latency tail:
curl -s http://blackbox-exporter.lxd:9115/metrics | grep probe_duration

# Tail the exporter log:
sudo journalctl -u blackbox_exporter -f
```

## What this role does NOT cover

- **Multi-step parcours.** Blackbox can't carry session cookies across probes; the Register-then-Verify-then-Login flow needs a custom synthetic client. Tracked for v1.0.10.
- **Status page.** Cachet/statuspage.io is a separate operator decision per the roadmap. The `/api/v1/status` endpoint is consumable by both.
- **Off-box deploy.** Lab phase-1 runs the container on the same Incus host as the things it's probing. Phase-2 moves it off-cluster.
infra/ansible/roles/blackbox_exporter/defaults/main.yml (new file, 20 lines)
@@ -0,0 +1,20 @@
```yaml
# blackbox_exporter defaults — synthetic monitoring runner.
# v1.0.9 W5 Day 24.
#
# Sits OUTSIDE the prod network (separate Incus host or off-box) so a
# probe failure reflects what an external user sees, not what an
# already-broken internal service hides. Six parcours per the roadmap,
# probed every 5 min by Prometheus.
---
blackbox_version: "0.25.0"
blackbox_arch: amd64

# Listener — Prometheus scrapes this on port 9115 (the blackbox_exporter
# default).
blackbox_listen_port: 9115

# Probe targets. The 6 parcours from the roadmap are mapped to simpler
# blackbox probes here (HTTP 2xx) ; the multi-step parcours that need
# session state (Register → Login → Search) are out of scope for
# blackbox itself and tracked as a follow-up (synthetic-client binary).
blackbox_target_base_url: "https://staging.veza.fr"
```
infra/ansible/roles/blackbox_exporter/handlers/main.yml (new file, 6 lines)
@@ -0,0 +1,6 @@
```yaml
---
- name: Restart blackbox_exporter
  ansible.builtin.systemd:
    name: blackbox_exporter
    state: restarted
    daemon_reload: true
```
infra/ansible/roles/blackbox_exporter/tasks/main.yml (new file, 89 lines)
@@ -0,0 +1,89 @@
```yaml
# blackbox_exporter role — installs the Prometheus blackbox exporter
# from the official tarball, drops the systemd unit, renders the probe
# config. Idempotent.
---
- name: Ensure /opt/blackbox_exporter exists
  ansible.builtin.file:
    path: /opt/blackbox_exporter
    state: directory
    owner: root
    group: root
    mode: "0755"
  tags: [blackbox, install]

- name: Check installed blackbox_exporter version
  ansible.builtin.stat:
    path: "/opt/blackbox_exporter/blackbox_exporter-{{ blackbox_version }}.linux-{{ blackbox_arch }}"
  register: blackbox_installed
  tags: [blackbox, install]

- name: Download blackbox_exporter tarball
  ansible.builtin.get_url:
    url: "https://github.com/prometheus/blackbox_exporter/releases/download/v{{ blackbox_version }}/blackbox_exporter-{{ blackbox_version }}.linux-{{ blackbox_arch }}.tar.gz"
    dest: "/tmp/blackbox_exporter-{{ blackbox_version }}.tar.gz"
    mode: "0644"
  when: not blackbox_installed.stat.exists
  tags: [blackbox, install]

- name: Extract blackbox_exporter into versioned slot
  ansible.builtin.unarchive:
    src: "/tmp/blackbox_exporter-{{ blackbox_version }}.tar.gz"
    dest: /opt/blackbox_exporter
    remote_src: true
    creates: "/opt/blackbox_exporter/blackbox_exporter-{{ blackbox_version }}.linux-{{ blackbox_arch }}"
  when: not blackbox_installed.stat.exists
  tags: [blackbox, install]

- name: Symlink /usr/local/bin/blackbox_exporter → versioned binary
  ansible.builtin.file:
    src: "/opt/blackbox_exporter/blackbox_exporter-{{ blackbox_version }}.linux-{{ blackbox_arch }}/blackbox_exporter"
    dest: /usr/local/bin/blackbox_exporter
    state: link
    force: true
  notify: Restart blackbox_exporter
  tags: [blackbox, install]

- name: Create blackbox system user
  ansible.builtin.user:
    name: blackbox
    system: true
    shell: /usr/sbin/nologin
    create_home: false
  tags: [blackbox, install]

- name: Ensure /etc/blackbox_exporter exists
  ansible.builtin.file:
    path: /etc/blackbox_exporter
    state: directory
    owner: root
    group: blackbox
    mode: "0750"
  tags: [blackbox, config]

- name: Render blackbox.yml
  ansible.builtin.template:
    src: blackbox.yml.j2
    dest: /etc/blackbox_exporter/blackbox.yml
    owner: root
    group: blackbox
    mode: "0640"
  notify: Restart blackbox_exporter
  tags: [blackbox, config]

- name: Render systemd unit
  ansible.builtin.template:
    src: blackbox_exporter.service.j2
    dest: /etc/systemd/system/blackbox_exporter.service
    owner: root
    group: root
    mode: "0644"
  notify: Restart blackbox_exporter
  tags: [blackbox, service]

- name: Enable + start blackbox_exporter
  ansible.builtin.systemd:
    name: blackbox_exporter
    state: started
    enabled: true
    daemon_reload: true
  tags: [blackbox, service]
```
infra/ansible/roles/blackbox_exporter/templates/blackbox.yml.j2 (new file, 61 lines)
@@ -0,0 +1,61 @@
```yaml
# Managed by Ansible — do not edit by hand.
# Probe modules used by Prometheus' blackbox scrape config.
# v1.0.9 W5 Day 24.

modules:
  # http_2xx — vanilla HTTP probe, accepts any 2xx response.
  http_2xx:
    prober: http
    timeout: 5s
    http:
      preferred_ip_protocol: ip4
      valid_status_codes: [200, 204]
      method: GET
      no_follow_redirects: false
      fail_if_ssl: false
      fail_if_not_ssl: true  # synthetic monitoring runs against staging w/ TLS

  # http_status_envelope — accept the {success: true, ...} body shape.
  # Used for /api/v1/health which wraps the verdict.
  http_status_envelope:
    prober: http
    timeout: 5s
    http:
      preferred_ip_protocol: ip4
      valid_status_codes: [200]
      method: GET
      fail_if_body_not_matches_regexp:
        - '"success"\s*:\s*true'

  # http_search — POST-less search probe. The synthetic user hits
  # /api/v1/search?q=test ; staging seed data must include something
  # for that query to return non-empty.
  http_search:
    prober: http
    timeout: 8s
    http:
      preferred_ip_protocol: ip4
      valid_status_codes: [200]
      method: GET
      fail_if_body_not_matches_regexp:
        - '"tracks"'

  # http_marketplace — same shape, different endpoint.
  http_marketplace:
    prober: http
    timeout: 8s
    http:
      preferred_ip_protocol: ip4
      valid_status_codes: [200]
      method: GET

  # tcp_websocket — bare TCP connect to the WS port to verify the
  # listener is alive. Doesn't speak the WS protocol — for that the
  # synthetic-client binary (out of scope for this role) handles
  # connect+send+receive.
  tcp_websocket:
    prober: tcp
    timeout: 5s
    tcp:
      preferred_ip_protocol: ip4
      tls: true
```
infra/ansible/roles/blackbox_exporter/templates/blackbox_exporter.service.j2 (new file, 27 lines)
@@ -0,0 +1,27 @@
```ini
# Managed by Ansible — do not edit by hand.
[Unit]
Description=Prometheus Blackbox Exporter
Documentation=https://github.com/prometheus/blackbox_exporter
After=network-online.target
Wants=network-online.target

[Service]
Type=simple
User=blackbox
Group=blackbox
ExecStart=/usr/local/bin/blackbox_exporter \
  --config.file=/etc/blackbox_exporter/blackbox.yml \
  --web.listen-address=:{{ blackbox_listen_port }}
Restart=on-failure
RestartSec=5s
LimitNOFILE=65535
NoNewPrivileges=true
ProtectSystem=strict
ProtectHome=true
PrivateTmp=true
ProtectKernelTunables=true
ProtectKernelModules=true
ProtectControlGroups=true

[Install]
WantedBy=multi-user.target
```
scripts/observability/scan-failed-colors.sh (new executable file, 70 lines)
@@ -0,0 +1,70 @@
```bash
#!/usr/bin/env bash
# scan-failed-colors.sh — emit veza_deploy_failed_color_alive textfile
# metrics from `incus list`. Designed to be called every minute by a
# systemd timer on the Incus host ; node_exporter's textfile collector
# picks the file up.
#
# A "failed-deploy color" is defined here as: an inactive color
# (NOT the one in /var/lib/veza/active-color in the haproxy container)
# whose containers are present and RUNNING. In normal operation, the
# inactive color exists exactly because the LAST deploy DIDN'T fail
# (it became the new prior color). The signal we want is when the
# inactive color outlives its useful window — Phase E.fail kept it
# alive for forensics and the operator forgot to clean up.
#
# Heuristic: emit the metric whenever an inactive color exists. The
# alert (VezaFailedColorAlive) is gated by `for: 24h` which converts
# "color is inactive" into "color has been inactive for >24h", which
# is the actual page-worthy signal.
#
# Usage:
#   PREFIX=veza-staging- /opt/veza/scripts/scan-failed-colors.sh
# Output:
#   /var/lib/node_exporter/textfile_collector/veza_deploy_failed_colors.prom

set -euo pipefail

PREFIX="${PREFIX:-veza-}"
ENV="${ENV:-$(echo "$PREFIX" | sed -E 's/^veza-?//;s/-$//')}"
HAPROXY_CT="${PREFIX}haproxy"
TEXTFILE_DIR="${TEXTFILE_DIR:-/var/lib/node_exporter/textfile_collector}"
OUT="${TEXTFILE_DIR}/veza_deploy_failed_colors.prom"

mkdir -p "$TEXTFILE_DIR"

# Read active color from the HAProxy container ; default blue if file
# missing (first-ever deploy, no rollback history).
if incus exec "$HAPROXY_CT" -- /bin/true 2>/dev/null; then
  ACTIVE=$(incus exec "$HAPROXY_CT" -- cat /var/lib/veza/active-color 2>/dev/null | tr -d '[:space:]' || echo blue)
else
  ACTIVE=blue
fi
[ -z "$ACTIVE" ] && ACTIVE=blue
INACTIVE=$([ "$ACTIVE" = "blue" ] && echo green || echo blue)

# Emit a single sample per color. A 1 means "this inactive color has
# at least one app container alive" ; 0 (or absence) means clean.
TMPFILE="${OUT}.tmp"
{
  echo "# HELP veza_deploy_failed_color_alive 1 if the inactive color has live app containers."
  echo "# TYPE veza_deploy_failed_color_alive gauge"
  for COLOR in blue green; do
    if [ "$COLOR" = "$ACTIVE" ]; then
      # Active color is by definition NOT a failed-deploy color.
      echo "veza_deploy_failed_color_alive{env=\"$ENV\",color=\"$COLOR\"} 0"
      continue
    fi
    ALIVE=0
    for COMP in backend stream web; do
      CT="${PREFIX}${COMP}-${COLOR}"
      STATE=$(incus list "$CT" -c s --format csv 2>/dev/null || true)
      if [ "$STATE" = "RUNNING" ]; then
        ALIVE=1
        break
      fi
    done
    echo "veza_deploy_failed_color_alive{env=\"$ENV\",color=\"$COLOR\"} $ALIVE"
  done
} > "$TMPFILE"

mv -f "$TMPFILE" "$OUT"
```
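The minute-cadence timer the header mentions could be wired like this (the unit names are assumptions; the script path is the one from the Usage comment above):

```ini
# /etc/systemd/system/veza-scan-failed-colors.service (hypothetical)
[Unit]
Description=Emit veza_deploy_failed_color_alive textfile metrics

[Service]
Type=oneshot
Environment=PREFIX=veza-staging-
ExecStart=/opt/veza/scripts/scan-failed-colors.sh

# /etc/systemd/system/veza-scan-failed-colors.timer (hypothetical)
[Unit]
Description=Run scan-failed-colors.sh every minute

[Timer]
OnCalendar=minutely
AccuracySec=5s

[Install]
WantedBy=timers.target
```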