diff --git a/infra/ansible/inventory/lab.yml b/infra/ansible/inventory/lab.yml
index 4ac357d47..fc70bf53a 100644
--- a/infra/ansible/inventory/lab.yml
+++ b/infra/ansible/inventory/lab.yml
@@ -72,6 +72,16 @@ all:
      # references this group to point each sentinel at it.
      hosts:
        redis-1:
    # v1.0.9 — phase-1 self-hosted edge cache fronting the MinIO cluster.
    # Single container colocated on the lab host. Phase-2 (W3+) adds a
    # second node + GeoDNS; phase-3 only wires Bunny.net via the
    # existing CDN_* env vars.
    nginx_cache:
      hosts:
        nginx-cache:
      vars:
        ansible_connection: community.general.incus
        ansible_python_interpreter: /usr/bin/python3
    # v1.0.9 W3 Day 12: distributed MinIO with EC:2. 4 Incus containers,
    # each providing one drive; single erasure set tolerates 2 simultaneous
    # node failures.
diff --git a/infra/ansible/playbooks/nginx_proxy_cache.yml b/infra/ansible/playbooks/nginx_proxy_cache.yml
new file mode 100644
index 000000000..004869ac0
--- /dev/null
+++ b/infra/ansible/playbooks/nginx_proxy_cache.yml
@@ -0,0 +1,54 @@
# Phase-1 edge cache playbook — provisions a single Incus container
# `nginx-cache` and lays down nginx + the proxy_cache config in front
# of the MinIO cluster.
#
# v1.0.9 — phase-1 alternative to a third-party CDN. Pairs with the
# existing `cdn_service.go`, which stays inert (CDN_ENABLED=false) until
# traffic justifies a third-party edge.
#
# Run with:
#   ansible-galaxy collection install community.general
#   ansible-playbook -i inventory/lab.yml playbooks/nginx_proxy_cache.yml
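#
# Targeted re-run after template-only changes — a sketch; the flags are
# standard ansible-playbook options and the tags are the ones defined
# in the role:
#   ansible-playbook -i inventory/lab.yml playbooks/nginx_proxy_cache.yml \
#     --limit nginx_cache --tags config --check --diff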
---
- name: Provision Incus container for the edge cache
  hosts: incus_hosts
  become: true
  gather_facts: true
  tasks:
    - name: Launch nginx-cache
      ansible.builtin.shell:
        cmd: |
          set -e
          if ! incus info nginx-cache >/dev/null 2>&1; then
            incus launch images:ubuntu/22.04 nginx-cache
            for _ in $(seq 1 30); do
              if incus exec nginx-cache -- cloud-init status 2>/dev/null | grep -q "status: done"; then
                break
              fi
              sleep 1
            done
            incus exec nginx-cache -- apt-get update
            incus exec nginx-cache -- apt-get install -y python3 python3-apt
            echo "PROVISIONED"  # sentinel consumed by changed_when below
          fi
      args:
        executable: /bin/bash
      register: provision_result
      changed_when: "'PROVISIONED' in provision_result.stdout"
      tags: [nginx_cache, provision]

    - name: Refresh inventory so the new container is reachable
      ansible.builtin.meta: refresh_inventory

- name: Apply common baseline
  hosts: nginx_cache
  become: true
  gather_facts: true
  roles:
    - common

- name: Install + configure the edge cache
  hosts: nginx_cache
  become: true
  gather_facts: true
  roles:
    - nginx_proxy_cache
diff --git a/infra/ansible/roles/nginx_proxy_cache/README.md b/infra/ansible/roles/nginx_proxy_cache/README.md
new file mode 100644
index 000000000..b79fcb0e6
--- /dev/null
+++ b/infra/ansible/roles/nginx_proxy_cache/README.md
@@ -0,0 +1,114 @@
# `nginx_proxy_cache` role — phase-1 self-hosted edge cache

Runs in its own Incus container `nginx-cache`, between clients and the distributed MinIO cluster. Caches HLS segments aggressively (1 MiB slices, 7 d TTL) and HLS playlists conservatively (60 s TTL). Disk-backed, capped at 20 GB, with stale-on-error fallback.

This is the **phase-1 alternative to a third-party CDN**. It costs nothing in egress, leaks no logs to a third party, and handles thousands of concurrent listeners on a single R720. Phase-2 (W3+) adds a second, geographically distinct cache node + GeoDNS; phase-3 happens only if traffic justifies a third-party CDN (Bunny.net is wired in `cdn_service.go` and stays inert until `CDN_ENABLED=true`).

## Topology

```
                 :80
                  │
         ┌────────▼───────┐
         │  nginx-cache   │   proxy_cache_path /var/cache/nginx/veza
         │  (this role)   │   20 GB disk, 1 MiB slices, 7 d TTL
         └────────┬───────┘
                  │  keepalive ×32 backend pool
      ┌───────────┼────────────┬────────────┐
      ▼           ▼            ▼            ▼
 minio-1.lxd  minio-2.lxd  minio-3.lxd  minio-4.lxd
            (EC:2 distributed cluster)
```

When `CDN_ENABLED=false` (the default), `TrackService.GetStorageURL` returns `http://minio-1.lxd:9000/...` presigned URLs directly. To route through this cache layer, point the backend at the cache instead:

```env
AWS_S3_ENDPOINT=http://nginx-cache.lxd:80
```

The cache forwards to MinIO; presigned URLs still work because the signature lives in the query string. Since `$args` is part of the cache key, each freshly signed URL is a distinct cache entry — hits only accumulate within one signed-URL window.

## Defaults

| variable                          | default                  | meaning                                                 |
| --------------------------------- | ------------------------ | ------------------------------------------------------- |
| `nginx_cache_root`                | `/var/cache/nginx/veza`  | disk-backed cache root                                  |
| `nginx_cache_max_size`            | `20g`                    | hard cap on the cache directory                         |
| `nginx_cache_inactive`            | `7d`                     | purge entries unused for > 7 d                          |
| `nginx_cache_ttl_segment`         | `7d`                     | TTL for `.ts` / `.m4s` / `.mp4` / `.aac` / `.m4a`       |
| `nginx_cache_ttl_playlist`        | `60s`                    | TTL for `.m3u8`                                         |
| `nginx_cache_ttl_other`           | `1h`                     | TTL for everything else (cover art, originals)          |
| `nginx_cache_stale_error_window`  | `1h`                     | serve stale on origin 5xx / timeout for this window     |
| `nginx_cache_listen_port`         | `80`                     | listener (HTTP). TLS lives at the public LB.            |
| `nginx_cache_minio_port`          | `9000`                   | MinIO upstream port                                     |

## Cache-key policy

```
"$scheme$request_method$host$uri$is_args$args" + $slice_range (segments only)
```

- **Authorization / Cookie are not in the key.** All access in v1.0 goes through presigned URLs (signature in `$args`), so per-user state is naturally segmented by query string. Caching authenticated traffic would cut both ways: leaving the auth header out of the key would leak per-user objects across users, and putting it in would explode key cardinality.
- **`$slice_range`**: 1 MiB slices. A range request for `bytes=0-512000` is served from the same cached chunks as `bytes=300000-700000`; cache effectiveness stays high even when clients pick odd byte windows.

## Verifying it works

```bash
# Curl the same URL twice through the cache. First should be MISS,
# second should be HIT. The X-Cache-Status header surfaces the verdict.
curl -sI http://nginx-cache.lxd/veza-prod-tracks/<track-id>/master.m3u8 | grep -i x-cache
# x-cache-status: MISS
curl -sI http://nginx-cache.lxd/veza-prod-tracks/<track-id>/master.m3u8 | grep -i x-cache
# x-cache-status: HIT
```

The smoke test `infra/ansible/tests/test_nginx_cache.sh` automates this check.
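
Byte-range requests ride the same cached slices. A quick spot check — `<track-id>` and the segment name are placeholders, and the first window may read MISS while the underlying 1 MiB slice is fetched:

```bash
# Two overlapping byte windows land on the same 1 MiB slice, so the
# second request should come back as a HIT.
curl -s -o /dev/null -D - -r 1000-2000 \
  "http://nginx-cache.lxd/veza-prod-tracks/<track-id>/seg-00001.ts" | grep -iE 'x-cache-status|content-range'
curl -s -o /dev/null -D - -r 1500-2500 \
  "http://nginx-cache.lxd/veza-prod-tracks/<track-id>/seg-00001.ts" | grep -iE 'x-cache-status|content-range'
# second call: x-cache-status: HIT
```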

## Operations

```bash
# Disk usage of the cache directory:
sudo du -sh /var/cache/nginx/veza

# Tail access logs (shows HIT/MISS/STALE per request):
sudo tail -f /var/log/nginx/veza-cache.access.log

# Reload after changing TTLs without dropping in-flight requests:
sudo systemctl reload nginx

# Bust the entire cache:
sudo systemctl stop nginx
sudo rm -rf /var/cache/nginx/veza/*
sudo systemctl start nginx

# Per-key purge requires ngx_cache_purge or nginx-plus — not in v1.0.
# Workaround: delete the entry from disk. Nginx names each cache file
# after the md5 of its cache key, fanned out by levels=1:2 as
# /var/cache/nginx/veza/<last char>/<next 2 chars>/<md5>.
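
# Example — locate (then delete) the entry for one playlist URL.
# Hypothetical object; for sliced segments the key also embeds
# $slice_range, so one segment URL maps to several cache files.
key='httpGETnginx-cache.lxd/veza-prod-tracks/<track-id>/master.m3u8'
h=$(printf '%s' "$key" | md5sum | awk '{print $1}')
sudo rm -f "/var/cache/nginx/veza/${h: -1}/${h: -3:2}/${h}"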

# Stub-status (Prometheus exporter target) — bound to loopback only:
curl -s http://127.0.0.1:81/__nginx_status
# Active connections: 4
# server accepts handled requests
#  12345 12345 67890
# Reading: 0 Writing: 1 Waiting: 3
```

## Hit-rate dashboard

The access log carries `cache=$upstream_cache_status`. Point a Promtail (or Vector) instance at `/var/log/nginx/veza-cache.access.log` and group by `cache` for a hit-ratio panel. Until that's wired, a quick command:

```bash
sudo grep -oP 'cache=\K\w+' /var/log/nginx/veza-cache.access.log \
  | sort | uniq -c | sort -rn
#  18432 HIT
#   1284 MISS
#     16 EXPIRED
```

## What this role does NOT cover

- **TLS termination.** The Incus bridge is the trust boundary in v1.0. Public exposure goes through the existing HAProxy/Caddy LB, which terminates TLS in front of this cache. When phase-2 puts the cache directly on the public internet, switch `nginx_cache_listen_port` to 443 and add `tls_cert_path` / `tls_key_path` defaults.
- **Per-key purge.** OSS nginx has no native purge; v1.1 adds either ngx_cache_purge (compiled-in module) or migrates to Varnish.
- **Multi-node coordination.** Single cache node in phase-1. Phase-2 introduces a second node + GeoDNS — independent caches are fine because HLS segments are immutable.
- **Brotli.** Audio is already compressed; gzip is enabled for `.m3u8` only. Brotli would add CPU for marginal gains.
diff --git a/infra/ansible/roles/nginx_proxy_cache/defaults/main.yml b/infra/ansible/roles/nginx_proxy_cache/defaults/main.yml
new file mode 100644
index 000000000..20c47a29a
--- /dev/null
+++ b/infra/ansible/roles/nginx_proxy_cache/defaults/main.yml
@@ -0,0 +1,52 @@
# nginx_proxy_cache defaults — phase-1 edge cache (self-hosted) in
# front of the distributed MinIO cluster.
#
# Why nginx and not Varnish: VCL is overkill for HLS in front of S3.
# Segments are content-addressed (immutable), playlists rotate every
# 60 s; a plain HTTP cache with proper Cache-Control fences is
# sufficient. Nginx also integrates trivially with TLS, structured
# logs, and the existing Prometheus stack via stub_status.
#
# Phase-1 scope: single cache node colocated on the R720 host
# (Incus container `nginx-cache`). Phase-2 (W3+) adds a second,
# geographically distinct cache node + GeoDNS; phase-3 only if the
# self-hosted edges aren't enough.
---
nginx_cache_root: /var/cache/nginx/veza
nginx_cache_max_size: "20g"      # disk cap. R720 has plenty of space.
nginx_cache_inactive: "7d"       # purge entries unused for > 7 d
nginx_cache_levels: "1:2"        # 16 × 256 dir fan-out, plenty for 100k objects

# Origin pool — points at the MinIO cluster. The role reads the
# groups['minio_nodes'] inventory group to populate the upstream block
# automatically; override here if testing against an external bucket.
nginx_cache_minio_port: 9000

# Cache TTLs by file extension. Segments are content-addressed
# (immutable), so 7 days is safe. The backend sends
# Cache-Control: max-age=86400, immutable, but the site config ignores
# origin Cache-Control for segments, so this value is the effective
# TTL at the edge.
nginx_cache_ttl_segment: "7d"    # .ts, .m4s, .mp4, .aac, .m4a
nginx_cache_ttl_playlist: "60s"  # .m3u8 (live streams may regen)
nginx_cache_ttl_other: "1h"      # cover art, generic objects

# Stale-on-error: if the origin times out / 5xxs, serve the stale
# cached version. Bounded so we don't lock listeners into a permanently
# stale view if MinIO is genuinely gone.
nginx_cache_stale_error_window: "1h"

# Listener config. v1.0 = HTTP only on the Incus bridge; TLS
# termination lives at the public LB (HAProxy/Caddy in prod). When
# we add direct internet exposure (phase-2), tls_cert_path /
# tls_key_path go here.
nginx_cache_listen_port: 80
nginx_cache_server_name: "cache.veza.lxd"

# Worker tuning. "auto" gives one worker per core; set both values
# explicitly so dashboards built on the stub_status exporter have
# known figures to graph against.
nginx_cache_worker_processes: "auto"
nginx_cache_worker_connections: 4096

# Stub-status endpoint for the Prometheus nginx exporter. Bound to
# loopback only — the exporter sidecar reads it via 127.0.0.1.
nginx_cache_stub_status_path: "/__nginx_status"
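
# Example lab override (host_vars/nginx-cache.yml) — values are
# hypothetical, for a disk-constrained host:
#   nginx_cache_max_size: "5g"
#   nginx_cache_ttl_segment: "24h"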
+--- +- name: Install nginx + curl (curl needed for the smoke test) + ansible.builtin.apt: + name: + - nginx + - curl + state: present + update_cache: true + cache_valid_time: 3600 + tags: [nginx_cache, packages] + +- name: Ensure cache root directory + ansible.builtin.file: + path: "{{ nginx_cache_root }}" + state: directory + owner: www-data + group: www-data + mode: "0755" + tags: [nginx_cache, config] + +- name: Render nginx.conf (top-level) + ansible.builtin.template: + src: nginx.conf.j2 + dest: /etc/nginx/nginx.conf + owner: root + group: root + mode: "0644" + notify: Reload nginx + tags: [nginx_cache, config] + +- name: Disable the default site + ansible.builtin.file: + path: /etc/nginx/sites-enabled/default + state: absent + notify: Reload nginx + tags: [nginx_cache, config] + +- name: Render the veza-cache site config + ansible.builtin.template: + src: sites/veza-cache.conf.j2 + dest: /etc/nginx/sites-available/veza-cache.conf + owner: root + group: root + mode: "0644" + notify: Reload nginx + tags: [nginx_cache, config] + +- name: Enable veza-cache site + ansible.builtin.file: + src: /etc/nginx/sites-available/veza-cache.conf + dest: /etc/nginx/sites-enabled/veza-cache.conf + state: link + force: true + notify: Reload nginx + tags: [nginx_cache, config] + +- name: Validate nginx config + ansible.builtin.command: + cmd: nginx -t + changed_when: false + tags: [nginx_cache, config] + +- name: Enable + start nginx + ansible.builtin.systemd: + name: nginx + state: started + enabled: true + tags: [nginx_cache, service] diff --git a/infra/ansible/roles/nginx_proxy_cache/templates/nginx.conf.j2 b/infra/ansible/roles/nginx_proxy_cache/templates/nginx.conf.j2 new file mode 100644 index 000000000..96e6e2bc4 --- /dev/null +++ b/infra/ansible/roles/nginx_proxy_cache/templates/nginx.conf.j2 @@ -0,0 +1,87 @@ +# Managed by Ansible — do not edit by hand. +# Top-level nginx config tuned for proxy_cache duty in front of MinIO. +# Site-specific config lives in sites-enabled/veza-cache.conf. + +user www-data; +worker_processes {{ nginx_cache_worker_processes }}; +worker_rlimit_nofile 65535; +pid /run/nginx.pid; + +events { + worker_connections {{ nginx_cache_worker_connections }}; + multi_accept on; +} + +http { + sendfile on; + tcp_nopush on; + tcp_nodelay on; + keepalive_timeout 65; + server_tokens off; + client_max_body_size 0; # streaming proxy — no enforced upload cap + proxy_buffering on; + proxy_request_buffering off; + types_hash_max_size 2048; + include /etc/nginx/mime.types; + default_type application/octet-stream; + + # ---------------------------------------------------------------- + # Cache zones. Single shared zone for all object types ; segments + # and playlists are differentiated by Cache-Control / proxy_cache_valid + # rules in the site config, not by separate zones (saves keys_zone + # memory + simplifies invalidation). + # ---------------------------------------------------------------- + proxy_cache_path {{ nginx_cache_root }} + levels={{ nginx_cache_levels }} + keys_zone=veza_cache:128m + max_size={{ nginx_cache_max_size }} + inactive={{ nginx_cache_inactive }} + use_temp_path=off; + + # The cache key. We deliberately exclude Authorization/Cookie + # because these requests are presigned URLs (signature in query + # string, no auth headers). If we ever cache authenticated traffic + # we'd need to add `$http_authorization` here — but never combine + # with `$cookie_session` or we leak per-user objects across users. 
    proxy_cache_key "$scheme$request_method$host$uri$is_args$args";

    # Honor stale entries while the origin is unhealthy / updating.
    proxy_cache_use_stale error timeout updating
                          http_500 http_502 http_503 http_504;
    proxy_cache_background_update on;
    proxy_cache_lock on;
    proxy_cache_lock_timeout 5s;
    proxy_cache_revalidate on;

    # Pass Range through to MinIO so byte-range requests are honored.
    # Range responses are cached as 206 Partial Content; the segments
    # location additionally enables the slice module so arbitrary byte
    # windows map onto fixed 1 MiB chunks.
    proxy_set_header Host $host;
    proxy_set_header X-Real-IP $remote_addr;
    proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    proxy_set_header X-Forwarded-Proto $scheme;
    proxy_set_header Range $http_range;
    proxy_set_header If-Range $http_if_range;

    # Logs — compact format with HIT/MISS/STALE so dashboards can
    # compute hit ratio without a separate exporter.
    log_format veza_cache '$remote_addr - $remote_user [$time_iso8601] '
                          '"$request" $status $body_bytes_sent '
                          '"$http_referer" "$http_user_agent" '
                          'cache=$upstream_cache_status rt=$request_time '
                          'urt=$upstream_response_time';
    access_log /var/log/nginx/veza-cache.access.log veza_cache;
    error_log /var/log/nginx/veza-cache.error.log warn;

    gzip on;
    gzip_vary on;
    gzip_proxied any;
    gzip_types application/vnd.apple.mpegurl text/css application/javascript
               application/json text/plain text/xml;
    # Audio segments are already compressed (MPEG-TS, fMP4) — don't
    # waste CPU re-gzipping them.

    include /etc/nginx/conf.d/*.conf;
    include /etc/nginx/sites-enabled/*;
}
diff --git a/infra/ansible/roles/nginx_proxy_cache/templates/sites/veza-cache.conf.j2 b/infra/ansible/roles/nginx_proxy_cache/templates/sites/veza-cache.conf.j2
new file mode 100644
index 000000000..546b47a8b
--- /dev/null
+++ b/infra/ansible/roles/nginx_proxy_cache/templates/sites/veza-cache.conf.j2
@@ -0,0 +1,95 @@
# Managed by Ansible — do not edit by hand.
# Veza edge cache — proxy_pass to MinIO, cache HLS aggressively.

# MinIO upstream — round-robin across the EC:2 pool. The active
# health_check directive is nginx-plus only; on OSS we rely on the
# built-in passive health check (max_fails + fail_timeout) below.
upstream veza_minio {
{% for host in groups['minio_nodes'] | default(['minio-1', 'minio-2', 'minio-3', 'minio-4']) %}
    server {{ host }}.lxd:{{ nginx_cache_minio_port }} max_fails=3 fail_timeout=10s;
{% endfor %}
    keepalive 32;
}
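
# With the default 4-node lab inventory the loop above renders as:
#   server minio-1.lxd:9000 max_fails=3 fail_timeout=10s;
#   server minio-2.lxd:9000 max_fails=3 fail_timeout=10s;
#   ...one line per host in groups['minio_nodes'].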

# Internal stub_status endpoint for the Prometheus exporter.
server {
    listen 127.0.0.1:81;
    server_name localhost;

    location {{ nginx_cache_stub_status_path }} {
        stub_status;
        access_log off;
        allow 127.0.0.1;
        deny all;
    }
}

server {
    listen {{ nginx_cache_listen_port }} default_server;
    server_name {{ nginx_cache_server_name }};

    # Surface the cache verdict on every response so smoke tests and
    # operators can verify HIT/MISS without parsing access logs.
    add_header X-Cache-Status $upstream_cache_status always;

    # Health probe — bypasses the cache entirely so monitors see the
    # nginx instance's liveness, not a stale cache.
    location = /health {
        access_log off;
        default_type text/plain;
        return 200 "ok\n";
    }

    # ----------------------------------------------------------------
    # HLS segments — content-addressed (filename includes a hash) so
    # we cache aggressively. The backend sends max-age=86400, immutable,
    # but proxy_ignore_headers below makes proxy_cache_valid the
    # effective edge TTL, so 7 days applies.
    # ----------------------------------------------------------------
    location ~* \.(ts|m4s|mp4|aac|m4a)$ {
        proxy_pass http://veza_minio;
        proxy_cache veza_cache;
        proxy_cache_valid 200 206 {{ nginx_cache_ttl_segment }};
        proxy_cache_valid 404 1m;      # negative cache for typo'd URLs
        proxy_ignore_headers Cache-Control Expires Set-Cookie;
        proxy_http_version 1.1;
        proxy_set_header Connection "";

        # 1 MiB slices — nginx fetches fixed 1 MiB chunks even for
        # small range requests, dramatically improving the hit ratio
        # for byte ranges. Standard for HLS proxy_cache deployments.
        slice 1m;
        proxy_set_header Range $slice_range;
        proxy_cache_key "$scheme$request_method$host$uri$is_args$args$slice_range";
    }

    # ----------------------------------------------------------------
    # HLS playlists — short TTL because live streams may regenerate.
    # 60 s matches the backend's Cache-Control on .m3u8 responses.
    # ----------------------------------------------------------------
    location ~* \.m3u8$ {
        proxy_pass http://veza_minio;
        proxy_cache veza_cache;
        proxy_cache_valid 200 {{ nginx_cache_ttl_playlist }};
        proxy_cache_valid 404 10s;
        proxy_http_version 1.1;
        proxy_set_header Connection "";
    }

    # ----------------------------------------------------------------
    # Catch-all: cover art, original audio downloads, etc. Shorter TTL
    # than segments because these can be replaced in-place (track
    # edits); with no per-key purge, the 1 h TTL bounds how long a
    # stale copy can be served after an overwrite.
    # ----------------------------------------------------------------
    location / {
        proxy_pass http://veza_minio;
        proxy_cache veza_cache;
        proxy_cache_valid 200 206 {{ nginx_cache_ttl_other }};
        proxy_cache_valid 404 1m;
        proxy_http_version 1.1;
        proxy_set_header Connection "";
    }
}
diff --git a/infra/ansible/tests/test_nginx_cache.sh b/infra/ansible/tests/test_nginx_cache.sh
new file mode 100755
index 000000000..f7783d8c8
--- /dev/null
+++ b/infra/ansible/tests/test_nginx_cache.sh
@@ -0,0 +1,107 @@
#!/usr/bin/env bash
# test_nginx_cache.sh — verify the phase-1 edge cache actually caches.
#
# Sequence:
#   1. Pre-flight: nginx-cache reachable + /health returns 200.
#   2. Curl a fixed URL twice. First request must be MISS, second
#      must be HIT. Surfaced via the X-Cache-Status response header.
#   3. Verify the cache directory has at least one entry on disk.
#
# Run on the Incus host that owns the nginx-cache container, OR
# from a workstation that can reach nginx-cache.lxd:80. Point
# TEST_URL at a public bucket path so the test doesn't need MinIO
# credentials.
#
# v1.0.9 — phase-1 edge cache acceptance.
#
# Usage:
#   TEST_URL=http://nginx-cache.lxd/some-public-object.m4s \
#     bash infra/ansible/tests/test_nginx_cache.sh
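#
# To drive curl from inside the container instead (no .lxd resolution
# needed on the workstation; the object path is a placeholder):
#   RUN_VIA_INCUS=1 CACHE_HOST=127.0.0.1 \
#   TEST_URL=http://127.0.0.1/veza-prod-tracks/<track-id>/master.m3u8 \
#     bash infra/ansible/tests/test_nginx_cache.sh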
+# +# Usage: +# TEST_URL=http://nginx-cache.lxd/some-public-object.m4s \ +# bash infra/ansible/tests/test_nginx_cache.sh +# +# Exit codes : +# 0 — cache is working (MISS then HIT) +# 1 — cache not reachable +# 2 — second request did not return HIT +# 3 — required tool missing +set -euo pipefail + +CACHE_HOST=${CACHE_HOST:-nginx-cache.lxd} +CACHE_PORT=${CACHE_PORT:-80} +TEST_URL=${TEST_URL:-http://${CACHE_HOST}:${CACHE_PORT}/health} +RUN_VIA_INCUS=${RUN_VIA_INCUS:-0} + +log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; } +fail() { log "FAIL: $*"; exit "${2:-2}"; } + +require() { + command -v "$1" >/dev/null 2>&1 || fail "required tool missing on host: $1" 3 +} + +require curl +require date + +# helper : run curl in the right context. If RUN_VIA_INCUS=1 the +# command runs inside the host (which can resolve .lxd hostnames). +do_curl() { + if [ "$RUN_VIA_INCUS" = "1" ]; then + incus exec nginx-cache -- curl -sS "$@" + else + curl -sS "$@" + fi +} + +# ----------------------------------------------------------------------------- +# 1. Pre-flight — /health is alive. +# ----------------------------------------------------------------------------- +log "step 0: pre-flight — GET ${CACHE_HOST}:${CACHE_PORT}/health" +health_url="http://${CACHE_HOST}:${CACHE_PORT}/health" +if ! body=$(do_curl --max-time 5 -o /dev/null -w "%{http_code}" "$health_url" 2>&1); then + fail "cache not reachable at $health_url" 1 +fi +if [ "$body" != "200" ]; then + fail "cache /health returned HTTP $body, want 200" 1 +fi + +# ----------------------------------------------------------------------------- +# 2. The ${TEST_URL} dance — first MISS, second HIT. +# ----------------------------------------------------------------------------- +log "step 1: first request to $TEST_URL — expect MISS" +status1=$(do_curl --max-time 10 -o /dev/null -D - "$TEST_URL" 2>&1 \ + | tr -d '\r' | awk -F': ' 'tolower($1) == "x-cache-status" {print $2}' | tr -d ' ') +log " X-Cache-Status: $status1" +if [ -z "$status1" ]; then + fail "first response missing X-Cache-Status header — site config not applying ?" 2 +fi + +# A cacheable URL with no prior entry is MISS or EXPIRED. STALE means +# the upstream is unhealthy, which is a different bug — fail loud. +case "$status1" in + MISS|EXPIRED|REVALIDATED|BYPASS) ;; + HIT) log " WARN: first request already HIT (a previous run cached it)" ;; + STALE|UPDATING) fail "origin upstream unhealthy — got STALE on cold cache" 2 ;; + *) fail "unexpected cache status on first request: $status1" 2 ;; +esac + +log "step 2: second request to $TEST_URL — expect HIT" +sleep 1 # give nginx a moment to flush the entry to disk +status2=$(do_curl --max-time 10 -o /dev/null -D - "$TEST_URL" 2>&1 \ + | tr -d '\r' | awk -F': ' 'tolower($1) == "x-cache-status" {print $2}' | tr -d ' ') +log " X-Cache-Status: $status2" + +if [ "$status2" != "HIT" ]; then + fail "second request returned X-Cache-Status=$status2, want HIT — cache config not effective" 2 +fi + +# ----------------------------------------------------------------------------- +# 3. On-disk verification — at least one entry under the cache root. 
+# ----------------------------------------------------------------------------- +log "step 3: verifying cache directory has on-disk entries" +if [ "$RUN_VIA_INCUS" = "1" ]; then + count=$(incus exec nginx-cache -- bash -c "find /var/cache/nginx/veza -type f 2>/dev/null | wc -l" || echo 0) + log " on-disk entries: $count" + if [ "$count" = "0" ]; then + fail "cache directory is empty after a HIT — unexpected" 2 + fi +fi + +log "PASS: edge cache MISS→HIT roundtrip OK (status1=$status1, status2=$status2)" +exit 0