From 84e92a75e2774ec401a3f52f149541f251dbf3f3 Mon Sep 17 00:00:00 2001 From: senke Date: Tue, 28 Apr 2026 01:15:11 +0200 Subject: [PATCH] feat(observability): OTel SDK + collector + Tempo + 4 hot path spans (W2 Day 9) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wires distributed tracing end-to-end. Backend exports OTLP/gRPC to a collector, which tail-samples (errors + slow always, 10% rest) and ships to Tempo. Grafana service-map dashboard pivots on the 4 instrumented hot paths. - internal/tracing/otlp_exporter.go : InitOTLPTracer + Provider.Shutdown, BatchSpanProcessor (5s/512 batch), ParentBased(TraceIDRatio) sampler, W3C trace-context + baggage propagators. OTEL_SDK_DISABLED=true short-circuits to a no-op. Failure to dial collector is non-fatal. - cmd/api/main.go : init at boot, defer Shutdown(5s) on exit. appVersion ldflag-overridable for resource attributes. - 4 hot paths instrumented : * handlers/auth.go::Login → "auth.login" * core/track/track_upload_handler.go::InitiateChunkedUpload → "track.upload.initiate" * core/marketplace/service.go::ProcessPaymentWebhook → "payment.webhook" * handlers/search_handlers.go::Search → "search.query" PII guarded — email masked, query content not recorded (length only). - infra/ansible/roles/otel_collector : pin v0.116.1 contrib build, systemd unit, tail-sampling config (errors + > 500ms always kept). - infra/ansible/roles/tempo : pin v2.7.1 monolithic, local-disk backend (S3 deferred to v1.1), 14d retention. - infra/ansible/playbooks/observability.yml : provisions both Incus containers + applies common baseline + roles in order. - inventory/lab.yml : new groups observability, otel_collectors, tempo. - config/grafana/dashboards/service-map.json : node graph + 4 hot-path span tables + collector throughput/queue panels. - docs/ENV_VARIABLES.md §30 : 4 OTEL_* env vars documented. Acceptance criterion (Day 9) : login → span visible in Tempo UI. 
Lab deployment to validate with `ansible-playbook -i inventory/lab.yml playbooks/observability.yml` once roles/postgres_ha is up. Co-Authored-By: Claude Opus 4.7 (1M context) --- config/grafana/dashboards/service-map.json | 101 +++++++++ docs/ENV_VARIABLES.md | 20 +- infra/ansible/inventory/lab.yml | 16 ++ infra/ansible/playbooks/observability.yml | 71 +++++++ infra/ansible/roles/otel_collector/README.md | 54 +++++ .../roles/otel_collector/defaults/main.yml | 28 +++ .../roles/otel_collector/handlers/main.yml | 6 + .../roles/otel_collector/tasks/main.yml | 96 +++++++++ .../templates/otel-collector.service.j2 | 27 +++ .../templates/otel-collector.yaml.j2 | 97 +++++++++ infra/ansible/roles/tempo/README.md | 50 +++++ infra/ansible/roles/tempo/defaults/main.yml | 25 +++ infra/ansible/roles/tempo/handlers/main.yml | 6 + infra/ansible/roles/tempo/tasks/main.yml | 100 +++++++++ .../roles/tempo/templates/tempo.service.j2 | 26 +++ .../roles/tempo/templates/tempo.yaml.j2 | 58 ++++++ veza-backend-api/cmd/api/main.go | 29 +++ veza-backend-api/go.mod | 25 ++- veza-backend-api/go.sum | 64 ++++-- .../internal/core/marketplace/service.go | 18 ++ .../core/track/track_upload_handler.go | 21 ++ veza-backend-api/internal/handlers/auth.go | 23 ++- .../internal/handlers/search_handlers.go | 18 ++ .../internal/tracing/otlp_exporter.go | 194 ++++++++++++++++++ 24 files changed, 1139 insertions(+), 34 deletions(-) create mode 100644 config/grafana/dashboards/service-map.json create mode 100644 infra/ansible/playbooks/observability.yml create mode 100644 infra/ansible/roles/otel_collector/README.md create mode 100644 infra/ansible/roles/otel_collector/defaults/main.yml create mode 100644 infra/ansible/roles/otel_collector/handlers/main.yml create mode 100644 infra/ansible/roles/otel_collector/tasks/main.yml create mode 100644 infra/ansible/roles/otel_collector/templates/otel-collector.service.j2 create mode 100644 infra/ansible/roles/otel_collector/templates/otel-collector.yaml.j2 create 
mode 100644 infra/ansible/roles/tempo/README.md create mode 100644 infra/ansible/roles/tempo/defaults/main.yml create mode 100644 infra/ansible/roles/tempo/handlers/main.yml create mode 100644 infra/ansible/roles/tempo/tasks/main.yml create mode 100644 infra/ansible/roles/tempo/templates/tempo.service.j2 create mode 100644 infra/ansible/roles/tempo/templates/tempo.yaml.j2 create mode 100644 veza-backend-api/internal/tracing/otlp_exporter.go diff --git a/config/grafana/dashboards/service-map.json b/config/grafana/dashboards/service-map.json new file mode 100644 index 000000000..91e2d6d53 --- /dev/null +++ b/config/grafana/dashboards/service-map.json @@ -0,0 +1,101 @@ +{ + "annotations": { "list": [] }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [ + { "title": "Tempo data source", "type": "link", "url": "/explore?left=%7B%22datasource%22:%22tempo%22%7D" } + ], + "liveNow": false, + "panels": [ + { + "datasource": { "type": "tempo", "uid": "tempo" }, + "gridPos": { "h": 14, "w": 24, "x": 0, "y": 0 }, + "id": 1, + "type": "nodeGraph", + "title": "Service map (last 1h)", + "options": {}, + "targets": [ + { + "queryType": "serviceMap", + "refId": "A", + "datasource": { "type": "tempo", "uid": "tempo" } + } + ] + }, + { + "datasource": { "type": "tempo", "uid": "tempo" }, + "gridPos": { "h": 10, "w": 12, "x": 0, "y": 14 }, + "id": 2, + "type": "table", + "title": "Slowest spans (auth.login + track.upload.initiate + payment.webhook + search.query)", + "options": { "showHeader": true }, + "fieldConfig": { "defaults": { "custom": { "align": "auto" } }, "overrides": [] }, + "targets": [ + { + "query": "{name=~\"auth.login|track.upload.initiate|payment.webhook|search.query\"} | by(name) | aggregate(max(duration))", + "queryType": "traceql", + "tableType": "spans", + "refId": "A", + "datasource": { "type": "tempo", "uid": "tempo" } + } + ] + }, + { + "datasource": { "type": "tempo", "uid": "tempo" }, + "gridPos": { "h": 10, 
"w": 12, "x": 12, "y": 14 }, + "id": 3, + "type": "table", + "title": "Recent errors on hot paths", + "options": { "showHeader": true }, + "fieldConfig": { "defaults": { "custom": { "align": "auto" } }, "overrides": [] }, + "targets": [ + { + "query": "{name=~\"auth.login|track.upload.initiate|payment.webhook|search.query\" && status=error} | by(name)", + "queryType": "traceql", + "tableType": "spans", + "refId": "A", + "datasource": { "type": "tempo", "uid": "tempo" } + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, + "id": 4, + "type": "timeseries", + "title": "OTel collector — accepted vs refused spans", + "fieldConfig": { "defaults": { "unit": "short", "color": { "mode": "palette-classic" } }, "overrides": [] }, + "options": { "legend": { "displayMode": "list", "placement": "bottom" } }, + "targets": [ + { "expr": "sum(rate(otelcol_receiver_accepted_spans{receiver=\"otlp\"}[5m]))", "legendFormat": "accepted", "refId": "A" }, + { "expr": "sum(rate(otelcol_receiver_refused_spans{receiver=\"otlp\"}[5m]))", "legendFormat": "refused", "refId": "B" } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }, + "id": 5, + "type": "timeseries", + "title": "OTel collector — exporter queue depth", + "fieldConfig": { "defaults": { "unit": "short", "color": { "mode": "palette-classic" } }, "overrides": [] }, + "options": { "legend": { "displayMode": "list", "placement": "bottom" } }, + "targets": [ + { "expr": "otelcol_exporter_queue_size", "legendFormat": "{{exporter}}", "refId": "A" }, + { "expr": "otelcol_exporter_queue_capacity", "legendFormat": "{{exporter}} capacity", "refId": "B" } + ] + } + ], + "refresh": "30s", + "schemaVersion": 38, + "style": "dark", + "tags": ["veza", "tracing", "tempo"], + "templating": { "list": [] }, + "time": { "from": "now-1h", "to": "now" }, + "timepicker": {}, + "timezone": "browser", + 
"title": "Veza Service Map (Tempo)", + "uid": "veza-service-map", + "version": 1 +} diff --git a/docs/ENV_VARIABLES.md b/docs/ENV_VARIABLES.md index 7769cdfeb..615d2a12a 100644 --- a/docs/ENV_VARIABLES.md +++ b/docs/ENV_VARIABLES.md @@ -56,7 +56,8 @@ Tout le reste a un défaut raisonnable ou est opt-in. 27. [Variables dépréciées / legacy](#27-variables-dépréciées--legacy) 28. [Règles de validation production](#28-règles-de-validation-production) 29. [Drift template ↔ code](#29-drift-template--code) -30. [Checklist de démarrage](#30-checklist-de-démarrage) +30. [OpenTelemetry / distributed tracing](#30-opentelemetry--distributed-tracing-v109-day-9) +31. [Checklist de démarrage](#31-checklist-de-démarrage) **Légende** : **variable en gras** = critique en production (validée au boot). @@ -543,7 +544,22 @@ Survey 2026-04-23 a identifié des incohérences entre `.env.template` et le cod **Incohérence de nommage** : `SMTP_USERNAME` canonique vs `SMTP_USER` legacy ; `DB_MAX_*` code vs `DATABASE_MAX_*` template. -## 30. Checklist de démarrage +## 30. OpenTelemetry / distributed tracing (v1.0.9 Day 9) + +Quatre variables consommées par `veza-backend-api/internal/tracing/otlp_exporter.go` au boot. Toutes optionnelles — non set = comportement par défaut documenté. + +| Variable | Défaut | Effet | +| --- | --- | --- | +| `OTEL_EXPORTER_OTLP_ENDPOINT` | `localhost:4317` | gRPC endpoint de l'otel-collector. En prod : `otel-collector.lxd:4317`. | +| `OTEL_SDK_DISABLED` | `false` | `true` ou `1` → no-op tracer (zero spans émis). Utile en tests unitaires + dev local sans collector. | +| `OTEL_TRACES_SAMPLER_ARG` | `1.0` | Fraction de traces root samplées côté SDK (0..1). Prod recommandé `1.0` puisque le collector applique son propre tail-sampling derrière. | +| `OTEL_DEPLOYMENT_ENV` | (none) | Override de `cfg.Env` pour le `deployment.environment` resource attribute. Rarement utile. | + +Le binaire **ne crashe pas** si le collector est down : l'exporter bufferise puis retry. 
Les spans sont droppés au-delà de 2048 en buffer (taille de queue par défaut du BatchSpanProcessor).
+# +# Run with: +# ansible-galaxy collection install community.general +# ansible-playbook -i inventory/lab.yml playbooks/observability.yml --check +# ansible-playbook -i inventory/lab.yml playbooks/observability.yml +--- +- name: Provision Incus containers for the trace pipeline + hosts: incus_hosts + become: true + gather_facts: true + tasks: + - name: Launch otel-collector + tempo + ansible.builtin.shell: + cmd: | + set -e + for ct in otel-collector tempo; do + if ! incus info "$ct" >/dev/null 2>&1; then + incus launch images:ubuntu/22.04 "$ct" + # Wait for cloud-init. + for _ in $(seq 1 30); do + if incus exec "$ct" -- cloud-init status 2>/dev/null | grep -q "status: done"; then + break + fi + sleep 1 + done + incus exec "$ct" -- apt-get update + incus exec "$ct" -- apt-get install -y python3 python3-apt + fi + done + args: + executable: /bin/bash + register: provision_result + changed_when: "'incus launch' in provision_result.stdout" + tags: [observability, provision] + + - name: Refresh inventory so the new containers are reachable + ansible.builtin.meta: refresh_inventory + +- name: Apply common baseline to observability containers + hosts: observability + become: true + gather_facts: true + roles: + - common + +# Tempo first — the collector depends on it being reachable. Ansible +# runs roles in declaration order, but we put them on separate plays +# anyway because the collector needs Tempo's port open to fully start. 
+- name: Install + configure Tempo + hosts: tempo + become: true + gather_facts: true + roles: + - tempo + +- name: Install + configure otel-collector + hosts: otel_collectors + become: true + gather_facts: true + roles: + - otel_collector diff --git a/infra/ansible/roles/otel_collector/README.md b/infra/ansible/roles/otel_collector/README.md new file mode 100644 index 000000000..5b2742865 --- /dev/null +++ b/infra/ansible/roles/otel_collector/README.md @@ -0,0 +1,54 @@ +# `otel_collector` role — OpenTelemetry collector in front of Tempo + +Installs `opentelemetry-collector-contrib` (pinned via `otel_collector_version`) as a systemd service, renders a config that receives OTLP/gRPC from `veza-backend-api`, applies a tail-based sampler, and ships traces to Tempo. + +## Why a collector instead of API → Tempo direct + +- **Sampling decisions are server-side.** The API can't know if a trace had errors at the moment it ships its first span; the collector buffers a trace for 5s, then keeps it (errors + > 500ms) or drops it. +- **Retry buffering.** If Tempo is down for 30s, the collector retries; the API doesn't have to. +- **Cardinality fences.** The transform processor can drop high-cardinality attributes before they reach Tempo if a future regression sneaks one in. + +## Pipeline + +``` +veza-backend-api ──OTLP/gRPC:4317──▶ otel-collector ──OTLP/gRPC:4319──▶ Tempo + │ + └─── self-metrics → Prometheus :8888 +``` + +Processors in order: + +1. `memory_limiter` (256 MiB cap) +2. `resourcedetection` (host.name, host.id from /etc/machine-id) +3. `tail_sampling` (errors + slow always; rest at `otel_collector_tail_sample_ok_pct`%) +4. 
`batch` (1s flush, 8192 spans) + +## Defaults + +| variable | default | meaning | +| --------------------------------------- | -------------------- | ------------------------------------------------- | +| `otel_collector_version` | `0.116.1` | release tag from `opentelemetry-collector-releases` | +| `otel_collector_grpc_port` | `4317` | OTLP/gRPC listener | +| `otel_collector_http_port` | `4318` | OTLP/HTTP listener (kept open for browser SDKs) | +| `otel_collector_tempo_endpoint` | `tempo.lxd:4319` | Tempo OTLP gRPC | +| `otel_collector_tail_sample_ok_pct` | `10` | % of healthy traces kept | +| `otel_collector_memory_limit_mib` | `256` | hard cap | + +## Operations + +```bash +# Status: +sudo systemctl status otel-collector +sudo journalctl -u otel-collector -f + +# Health: +curl -fsS http://otel-collector.lxd:13133 + +# Self-metrics (collector throughput): +curl -fsS http://otel-collector.lxd:8888/metrics | grep otelcol_ +``` + +## What this role does NOT cover + +- **mTLS between API/collector/Tempo.** `tls.insecure: true` everywhere — the security boundary is the Incus bridge for v1.0. W4 swaps in cert-manager-issued certs. +- **Multi-region collector mesh.** Single-host deploy. v1.1+ adds a second collector behind HAProxy. diff --git a/infra/ansible/roles/otel_collector/defaults/main.yml b/infra/ansible/roles/otel_collector/defaults/main.yml new file mode 100644 index 000000000..961e2c713 --- /dev/null +++ b/infra/ansible/roles/otel_collector/defaults/main.yml @@ -0,0 +1,28 @@ +# otel_collector defaults — pin opentelemetry-collector-contrib to a +# known-good release. The "contrib" distribution is required because +# we need a few non-core processors (filter, transform). Override +# `otel_collector_version` per-env if you want a different release. +--- +otel_collector_version: "0.116.1" +otel_collector_arch: amd64 + +# Where the collector listens for spans from veza-backend-api. The +# backend default is localhost:4317 — flip both if you split hosts. 
+otel_collector_grpc_port: 4317 +otel_collector_http_port: 4318 + +# Tempo upstream. The Tempo container (roles/tempo) listens on its own +# OTLP gRPC port (default 4319 — distinct so the collector and Tempo +# don't fight over 4317 when colocated on the same host). +otel_collector_tempo_endpoint: "tempo.lxd:4319" + +# Sample everything in dev/staging. In prod the collector applies a +# tail-based sampler (config below) that keeps 100% of error spans and +# 10% of healthy ones. +otel_collector_tail_sample_error_pct: 100 +otel_collector_tail_sample_ok_pct: 10 + +# Resource limits — the collector is co-located with the API on the +# Incus host, so we cap it to 256 MiB heap to avoid memory pressure. +otel_collector_memory_limit_mib: 256 +otel_collector_memory_spike_limit_mib: 64 diff --git a/infra/ansible/roles/otel_collector/handlers/main.yml b/infra/ansible/roles/otel_collector/handlers/main.yml new file mode 100644 index 000000000..7ddf1d46e --- /dev/null +++ b/infra/ansible/roles/otel_collector/handlers/main.yml @@ -0,0 +1,6 @@ +--- +- name: Restart otel-collector + ansible.builtin.systemd: + name: otel-collector + state: restarted + daemon_reload: true diff --git a/infra/ansible/roles/otel_collector/tasks/main.yml b/infra/ansible/roles/otel_collector/tasks/main.yml new file mode 100644 index 000000000..23d13dfa7 --- /dev/null +++ b/infra/ansible/roles/otel_collector/tasks/main.yml @@ -0,0 +1,96 @@ +# otel_collector role — installs opentelemetry-collector-contrib as a +# tarball under /opt, drops the systemd unit, renders the config, and +# starts it. Idempotent. Designed to run in an Incus container so the +# collector can be restarted independently of the API process. 
+--- +- name: Ensure /opt/otelcol-contrib exists + ansible.builtin.file: + path: /opt/otelcol-contrib + state: directory + owner: root + group: root + mode: "0755" + tags: [otel_collector, install] + +- name: Check installed otelcol version + ansible.builtin.stat: + path: "/opt/otelcol-contrib/otelcol-contrib-{{ otel_collector_version }}" + register: otelcol_installed + tags: [otel_collector, install] + +- name: Download opentelemetry-collector-contrib tarball + ansible.builtin.get_url: + url: "https://github.com/open-telemetry/opentelemetry-collector-releases/releases/download/v{{ otel_collector_version }}/otelcol-contrib_{{ otel_collector_version }}_linux_{{ otel_collector_arch }}.tar.gz" + dest: "/tmp/otelcol-contrib-{{ otel_collector_version }}.tar.gz" + mode: "0644" + when: not otelcol_installed.stat.exists + tags: [otel_collector, install] + +- name: Extract collector binary into versioned slot + ansible.builtin.unarchive: + src: "/tmp/otelcol-contrib-{{ otel_collector_version }}.tar.gz" + dest: /opt/otelcol-contrib + remote_src: true + creates: "/opt/otelcol-contrib/otelcol-contrib-{{ otel_collector_version }}" + extra_opts: + - "--transform=s|^otelcol-contrib$|otelcol-contrib-{{ otel_collector_version }}|" + when: not otelcol_installed.stat.exists + tags: [otel_collector, install] + +# /usr/local/bin/otelcol-contrib symlink → versioned binary. Lets us +# bump the version by changing only `otel_collector_version` and +# re-running the role; systemd unit doesn't change. 
+- name: Symlink /usr/local/bin/otelcol-contrib → versioned binary + ansible.builtin.file: + src: "/opt/otelcol-contrib/otelcol-contrib-{{ otel_collector_version }}" + dest: /usr/local/bin/otelcol-contrib + state: link + force: true + notify: Restart otel-collector + tags: [otel_collector, install] + +- name: Create otel-collector system user + ansible.builtin.user: + name: otelcol + system: true + home: /var/lib/otel-collector + shell: /usr/sbin/nologin + create_home: true + tags: [otel_collector, install] + +- name: Ensure /etc/otel-collector exists + ansible.builtin.file: + path: /etc/otel-collector + state: directory + owner: root + group: otelcol + mode: "0750" + tags: [otel_collector, config] + +- name: Render collector config + ansible.builtin.template: + src: otel-collector.yaml.j2 + dest: /etc/otel-collector/otel-collector.yaml + owner: root + group: otelcol + mode: "0640" + notify: Restart otel-collector + tags: [otel_collector, config] + +- name: Render systemd unit + ansible.builtin.template: + src: otel-collector.service.j2 + dest: /etc/systemd/system/otel-collector.service + owner: root + group: root + mode: "0644" + notify: Restart otel-collector + tags: [otel_collector, service] + +- name: Enable + start otel-collector + ansible.builtin.systemd: + name: otel-collector + state: started + enabled: true + daemon_reload: true + tags: [otel_collector, service] diff --git a/infra/ansible/roles/otel_collector/templates/otel-collector.service.j2 b/infra/ansible/roles/otel_collector/templates/otel-collector.service.j2 new file mode 100644 index 000000000..884bb82e0 --- /dev/null +++ b/infra/ansible/roles/otel_collector/templates/otel-collector.service.j2 @@ -0,0 +1,27 @@ +# Managed by Ansible — do not edit by hand. 
+[Unit] +Description=OpenTelemetry Collector (contrib) +Documentation=https://opentelemetry.io/docs/collector/ +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +User=otelcol +Group=otelcol +ExecStart=/usr/local/bin/otelcol-contrib --config=/etc/otel-collector/otel-collector.yaml +Restart=on-failure +RestartSec=5s +LimitNOFILE=65535 +# Hardening — same baseline as the other Ansible-managed daemons. +NoNewPrivileges=true +ProtectSystem=strict +ProtectHome=true +ReadWritePaths=/var/lib/otel-collector +PrivateTmp=true +ProtectKernelTunables=true +ProtectKernelModules=true +ProtectControlGroups=true + +[Install] +WantedBy=multi-user.target diff --git a/infra/ansible/roles/otel_collector/templates/otel-collector.yaml.j2 b/infra/ansible/roles/otel_collector/templates/otel-collector.yaml.j2 new file mode 100644 index 000000000..d67a3d696 --- /dev/null +++ b/infra/ansible/roles/otel_collector/templates/otel-collector.yaml.j2 @@ -0,0 +1,97 @@ +# Managed by Ansible — do not edit by hand. +# +# opentelemetry-collector-contrib config. +# Pipeline: OTLP/gRPC receiver → batch + tail_sampling + memory_limiter +# → OTLP exporter to Tempo (gRPC). +# +# Tail sampling keeps every error span and {{ otel_collector_tail_sample_ok_pct }}% +# of healthy spans, which is what we want in prod (Tempo storage isn't +# free). Override percentages in inventory group_vars. + +receivers: + otlp: + protocols: + grpc: + endpoint: "0.0.0.0:{{ otel_collector_grpc_port }}" + http: + endpoint: "0.0.0.0:{{ otel_collector_http_port }}" + +processors: + # memory_limiter goes FIRST so it can reject spans before the rest of + # the pipeline allocates anything. Without it the collector OOMs + # silently when the API bursts. + memory_limiter: + check_interval: 1s + limit_mib: {{ otel_collector_memory_limit_mib }} + spike_limit_mib: {{ otel_collector_memory_spike_limit_mib }} + + # batch — flushes every 1s OR when 8192 spans queue up. 
The exporter + # likes batches, but we don't want a single span to wait > 1s on a + # quiet system. + batch: + timeout: 1s + send_batch_size: 8192 + send_batch_max_size: 16384 + + # tail_sampling — see policies below. Decision wait is 5s: spans are + # buffered for 5s after their trace's first span lands, then a + # decision (keep / drop) is taken. + tail_sampling: + decision_wait: 5s + num_traces: 50000 + expected_new_traces_per_sec: 100 + policies: + # ALWAYS keep error traces — they're how we debug prod. + - name: keep-errors + type: status_code + status_code: + status_codes: [ERROR] + # ALWAYS keep slow traces (> 500ms). Catches latency spikes even + # when the request "succeeded". + - name: keep-slow + type: latency + latency: + threshold_ms: 500 + # Sample remaining healthy traces at the env percentage. + - name: sample-rest + type: probabilistic + probabilistic: + sampling_percentage: {{ otel_collector_tail_sample_ok_pct }} + + # resourcedetection — best-effort attribute enrichment so spans carry + # host.name and host.id even if the SDK forgot to add them. + resourcedetection: + detectors: [system, env] + timeout: 2s + override: false + +exporters: + otlp/tempo: + endpoint: "{{ otel_collector_tempo_endpoint }}" + tls: + insecure: true # mTLS is W4 territory; the Incus bridge is the security boundary + + # debug exporter — useful in lab; in prod set verbosity: basic so it + # stays quiet. Uncomment in the service.pipelines block to enable. 
+ debug: + verbosity: basic + +extensions: + health_check: + endpoint: "0.0.0.0:13133" + pprof: + endpoint: "127.0.0.1:1777" + +service: + extensions: [health_check, pprof] + pipelines: + traces: + receivers: [otlp] + processors: [memory_limiter, resourcedetection, tail_sampling, batch] + exporters: [otlp/tempo] + telemetry: + logs: + level: info + metrics: + level: basic + address: "0.0.0.0:8888" # collector self-metrics scraped by Prometheus diff --git a/infra/ansible/roles/tempo/README.md b/infra/ansible/roles/tempo/README.md new file mode 100644 index 000000000..e80c67114 --- /dev/null +++ b/infra/ansible/roles/tempo/README.md @@ -0,0 +1,50 @@ +# `tempo` role — Grafana Tempo trace backend + +Single-binary Tempo (monolithic mode), local-disk storage, ~14 day retention. Receives OTLP/gRPC from `roles/otel_collector`, exposes the query API on `:3200` for Grafana. + +## Topology + +``` +otel-collector ──OTLP/gRPC:4319──▶ tempo ──HTTP:3200──▶ Grafana data source + │ + └─── /var/lib/tempo (blocks + WAL) +``` + +## Defaults + +| variable | default | meaning | +| --------------------------- | -------------------- | ---------------------------- | +| `tempo_version` | `2.7.1` | release tag | +| `tempo_otlp_grpc_port` | `4319` | OTLP/gRPC listener | +| `tempo_http_port` | `3200` | query API | +| `tempo_storage_backend` | `local` | `local` (v1.0) or `s3` (v1.1+) | +| `tempo_storage_local_path` | `/var/lib/tempo` | block + WAL root | +| `tempo_retention_h` | `336` (14d) | block retention | + +## Operations + +```bash +# Status: +sudo systemctl status tempo +sudo journalctl -u tempo -f + +# Health: +curl -fsS http://tempo.lxd:3200/ready +curl -fsS http://tempo.lxd:3200/metrics | grep tempo_ + +# Query a trace by ID: +curl -fsS "http://tempo.lxd:3200/api/traces/" + +# Search recent traces by service: +curl -fsS "http://tempo.lxd:3200/api/search?tags=service.name=veza-backend-api" +``` + +## Grafana data source + +In Grafana, add a Tempo data source pointing at 
`http://tempo.lxd:3200`. The service map in `config/grafana/dashboards/service-map.json` (W2 Day 9) is wired to this data source by name `tempo`. + +## What this role does NOT cover + +- **S3-backed storage.** v1.0 = local disk, single-host. v1.1 swaps `storage.trace.backend: s3` to ship blocks to MinIO so Tempo can run multi-replica. +- **Multi-tenancy.** Single tenant (`single-tenant`) until v1.2 brings hosted multi-tenancy in. +- **Metrics generator.** Service-map metrics are computed in the collector pipeline (cheaper than Tempo's `metrics_generator`). diff --git a/infra/ansible/roles/tempo/defaults/main.yml b/infra/ansible/roles/tempo/defaults/main.yml new file mode 100644 index 000000000..5fba0d756 --- /dev/null +++ b/infra/ansible/roles/tempo/defaults/main.yml @@ -0,0 +1,25 @@ +# Tempo defaults — single-binary mode (monolithic), local backend on +# the container's filesystem. Plenty for v1.0; W3+ moves to S3 +# (the same MinIO bucket the rest of the stack uses). +--- +tempo_version: "2.7.1" +tempo_arch: amd64 + +# Where Tempo listens for spans from the otel-collector. The collector +# default in roles/otel_collector points at tempo.lxd:4319, so keep +# them in sync. +tempo_otlp_grpc_port: 4319 +# Tempo's own HTTP API (Grafana data source uses this). +tempo_http_port: 3200 + +# Storage. v1.0 = local disk. v1.1 = S3 (MinIO bucket veza-tempo). +tempo_storage_backend: local +tempo_storage_local_path: /var/lib/tempo + +# Retention — Tempo doesn't compact aggressively; 14d default. +tempo_retention_h: 336 # 14 days + +# Resource sizing — see https://grafana.com/docs/tempo/latest/setup/ +# defaults are tuned for ~5k spans/sec which is way more than v1.0 +# traffic. Override if the API gets popular. 
+tempo_max_block_bytes: 524288000 # 500 MiB diff --git a/infra/ansible/roles/tempo/handlers/main.yml b/infra/ansible/roles/tempo/handlers/main.yml new file mode 100644 index 000000000..ede168ebb --- /dev/null +++ b/infra/ansible/roles/tempo/handlers/main.yml @@ -0,0 +1,6 @@ +--- +- name: Restart tempo + ansible.builtin.systemd: + name: tempo + state: restarted + daemon_reload: true diff --git a/infra/ansible/roles/tempo/tasks/main.yml b/infra/ansible/roles/tempo/tasks/main.yml new file mode 100644 index 000000000..330621248 --- /dev/null +++ b/infra/ansible/roles/tempo/tasks/main.yml @@ -0,0 +1,100 @@ +# Tempo role — installs the single-binary distribution under /opt, +# renders monolithic config, sets up systemd. Idempotent. +--- +- name: Ensure /opt/tempo exists + ansible.builtin.file: + path: /opt/tempo + state: directory + owner: root + group: root + mode: "0755" + tags: [tempo, install] + +- name: Check installed Tempo version + ansible.builtin.stat: + path: "/opt/tempo/tempo-{{ tempo_version }}" + register: tempo_installed + tags: [tempo, install] + +- name: Download Tempo tarball + ansible.builtin.get_url: + url: "https://github.com/grafana/tempo/releases/download/v{{ tempo_version }}/tempo_{{ tempo_version }}_linux_{{ tempo_arch }}.tar.gz" + dest: "/tmp/tempo-{{ tempo_version }}.tar.gz" + mode: "0644" + when: not tempo_installed.stat.exists + tags: [tempo, install] + +- name: Extract Tempo binary into versioned slot + ansible.builtin.unarchive: + src: "/tmp/tempo-{{ tempo_version }}.tar.gz" + dest: /opt/tempo + remote_src: true + creates: "/opt/tempo/tempo-{{ tempo_version }}" + extra_opts: + - "--transform=s|^tempo$|tempo-{{ tempo_version }}|" + when: not tempo_installed.stat.exists + tags: [tempo, install] + +- name: Symlink /usr/local/bin/tempo → versioned binary + ansible.builtin.file: + src: "/opt/tempo/tempo-{{ tempo_version }}" + dest: /usr/local/bin/tempo + state: link + force: true + notify: Restart tempo + tags: [tempo, install] + +- name: Create 
tempo system user + ansible.builtin.user: + name: tempo + system: true + home: "{{ tempo_storage_local_path }}" + shell: /usr/sbin/nologin + create_home: true + tags: [tempo, install] + +- name: Ensure storage directory ownership + ansible.builtin.file: + path: "{{ tempo_storage_local_path }}" + state: directory + owner: tempo + group: tempo + mode: "0755" + tags: [tempo, install] + +- name: Ensure /etc/tempo exists + ansible.builtin.file: + path: /etc/tempo + state: directory + owner: root + group: tempo + mode: "0750" + tags: [tempo, config] + +- name: Render tempo.yaml + ansible.builtin.template: + src: tempo.yaml.j2 + dest: /etc/tempo/tempo.yaml + owner: root + group: tempo + mode: "0640" + notify: Restart tempo + tags: [tempo, config] + +- name: Render systemd unit + ansible.builtin.template: + src: tempo.service.j2 + dest: /etc/systemd/system/tempo.service + owner: root + group: root + mode: "0644" + notify: Restart tempo + tags: [tempo, service] + +- name: Enable + start tempo + ansible.builtin.systemd: + name: tempo + state: started + enabled: true + daemon_reload: true + tags: [tempo, service] diff --git a/infra/ansible/roles/tempo/templates/tempo.service.j2 b/infra/ansible/roles/tempo/templates/tempo.service.j2 new file mode 100644 index 000000000..b6c660722 --- /dev/null +++ b/infra/ansible/roles/tempo/templates/tempo.service.j2 @@ -0,0 +1,26 @@ +# Managed by Ansible — do not edit by hand. 
+[Unit] +Description=Grafana Tempo +Documentation=https://grafana.com/docs/tempo/ +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +User=tempo +Group=tempo +ExecStart=/usr/local/bin/tempo -config.file=/etc/tempo/tempo.yaml +Restart=on-failure +RestartSec=5s +LimitNOFILE=65535 +NoNewPrivileges=true +ProtectSystem=strict +ProtectHome=true +ReadWritePaths={{ tempo_storage_local_path }} +PrivateTmp=true +ProtectKernelTunables=true +ProtectKernelModules=true +ProtectControlGroups=true + +[Install] +WantedBy=multi-user.target diff --git a/infra/ansible/roles/tempo/templates/tempo.yaml.j2 b/infra/ansible/roles/tempo/templates/tempo.yaml.j2 new file mode 100644 index 000000000..9283f6396 --- /dev/null +++ b/infra/ansible/roles/tempo/templates/tempo.yaml.j2 @@ -0,0 +1,58 @@ +# Managed by Ansible — do not edit by hand. +# +# Tempo monolithic mode. Receives OTLP from the otel-collector, +# stores in {{ tempo_storage_backend }} backend, exposes the query +# API on :{{ tempo_http_port }} for Grafana. + +server: + http_listen_port: {{ tempo_http_port }} + grpc_listen_port: 9095 + log_level: info + +distributor: + receivers: + otlp: + protocols: + grpc: + endpoint: "0.0.0.0:{{ tempo_otlp_grpc_port }}" + +ingester: + trace_idle_period: 10s + max_block_bytes: {{ tempo_max_block_bytes }} + max_block_duration: 5m + flush_check_period: 10s + +compactor: + compaction: + block_retention: {{ tempo_retention_h }}h + compacted_block_retention: 1h + compaction_window: 1h + max_block_bytes: 100_000_000 + retention_concurrency: 1 + +storage: + trace: + backend: {{ tempo_storage_backend }} +{% if tempo_storage_backend == "local" %} + local: + path: {{ tempo_storage_local_path }}/blocks +{% endif %} + wal: + path: {{ tempo_storage_local_path }}/wal + pool: + max_workers: 100 + queue_depth: 10000 + +# v1.0 single-binary mode — overrides keep the limits sane and prevent +# a misbehaving client from blowing up Tempo. 
We have one client today +# (veza-backend-api), one tenant. +overrides: + defaults: + ingestion: + rate_limit_bytes: 15_000_000 # 15 MB/s per tenant + burst_size_bytes: 30_000_000 + metrics_generator: + processors: [] # service-map metrics are computed in the collector instead + +usage_report: + reporting_enabled: false diff --git a/veza-backend-api/cmd/api/main.go b/veza-backend-api/cmd/api/main.go index ee8989b30..d01a6c98f 100644 --- a/veza-backend-api/cmd/api/main.go +++ b/veza-backend-api/cmd/api/main.go @@ -28,6 +28,7 @@ import ( "veza-backend-api/internal/services" "veza-backend-api/internal/services/hyperswitch" "veza-backend-api/internal/shutdown" + "veza-backend-api/internal/tracing" "veza-backend-api/internal/workers" _ "veza-backend-api/docs" // Import docs for swagger @@ -57,6 +58,11 @@ import ( // @name X-API-Key // @description Developer API key (obtain from Developer Portal). Format: vza_xxxxx +// appVersion is overridden at build time via +// `-ldflags "-X main.appVersion=vX.Y.Z"`. Used as the OTel resource +// attribute service.version + Sentry release tag. +var appVersion = "dev" + func main() { // Charger les variables d'environnement // NOTE: Do not write to stderr to avoid broken pipe errors with systemd journald @@ -108,6 +114,29 @@ func main() { logger.Info("ℹ️ Sentry non configuré (SENTRY_DSN non défini)") } + // v1.0.9 Day 9 — OpenTelemetry tracer init. Spans flow to the + // otel-collector container (provisioned by infra/ansible/roles/ + // otel_collector) which forwards them to Tempo. Disabled in + // dev / unit tests via OTEL_SDK_DISABLED=true to keep the + // process from background-dialing localhost:4317. + tracerCtx, tracerCancel := context.WithTimeout(context.Background(), 10*time.Second) + // AppVersion drawn from build-time ldflag; falls back to "dev" so + // the resource attribute is always populated. 
Set via: + // go build -ldflags "-X main.appVersion=v1.0.9" ./cmd/api + tracerProvider, err := tracing.InitOTLPTracer(tracerCtx, cfg.Env, appVersion, logger) + tracerCancel() + if err != nil { + // Tracing failure is operational, not fatal. The collector + // could be starting up at the same time as the backend; the + // exporter retries internally. + logger.Warn("OTel tracer init failed — continuing without spans", zap.Error(err)) + } + defer func() { + shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _ = tracerProvider.Shutdown(shutdownCtx) + }() + // Initialisation de la base de données db := cfg.Database if db == nil { diff --git a/veza-backend-api/go.mod b/veza-backend-api/go.mod index 52bff4a1a..1697d33ab 100644 --- a/veza-backend-api/go.mod +++ b/veza-backend-api/go.mod @@ -38,10 +38,14 @@ require ( github.com/swaggo/swag v1.16.6 github.com/testcontainers/testcontainers-go v0.42.0 github.com/testcontainers/testcontainers-go/modules/postgres v0.42.0 + go.opentelemetry.io/otel v1.43.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0 + go.opentelemetry.io/otel/sdk v1.43.0 go.uber.org/goleak v1.3.0 go.uber.org/zap v1.27.0 - golang.org/x/crypto v0.48.0 - golang.org/x/oauth2 v0.30.0 + golang.org/x/crypto v0.49.0 + golang.org/x/oauth2 v0.35.0 golang.org/x/time v0.12.0 gopkg.in/natefinch/lumberjack.v2 v2.2.1 gorm.io/driver/postgres v1.6.0 @@ -50,7 +54,7 @@ require ( ) require ( - cloud.google.com/go/compute/metadata v0.3.0 // indirect + cloud.google.com/go/compute/metadata v0.9.0 // indirect dario.cat/mergo v1.0.2 // indirect github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c // indirect github.com/KyleBanks/depth v1.2.1 // indirect @@ -77,6 +81,7 @@ require ( github.com/bytedance/sonic v1.14.0 // indirect github.com/bytedance/sonic/loader v0.3.0 // indirect github.com/cenkalti/backoff/v4 v4.3.0 // indirect + 
github.com/cenkalti/backoff/v5 v5.0.3 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/cloudwego/base64x v0.1.6 // indirect github.com/containerd/errdefs v1.0.0 // indirect @@ -105,6 +110,7 @@ require ( github.com/go-playground/universal-translator v0.18.1 // indirect github.com/goccy/go-json v0.10.2 // indirect github.com/goccy/go-yaml v1.18.0 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 // indirect github.com/jackc/pgpassfile v1.0.0 // indirect github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect github.com/jackc/pgx/v5 v5.6.0 // indirect @@ -151,18 +157,21 @@ require ( github.com/yusufpapurcu/wmi v1.2.4 // indirect go.opentelemetry.io/auto/sdk v1.2.1 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 // indirect - go.opentelemetry.io/otel v1.41.0 // indirect - go.opentelemetry.io/otel/metric v1.41.0 // indirect - go.opentelemetry.io/otel/trace v1.41.0 // indirect + go.opentelemetry.io/otel/metric v1.43.0 // indirect + go.opentelemetry.io/otel/trace v1.43.0 // indirect + go.opentelemetry.io/proto/otlp v1.10.0 // indirect go.uber.org/multierr v1.10.0 // indirect golang.org/x/arch v0.20.0 // indirect golang.org/x/image v0.38.0 // indirect golang.org/x/mod v0.33.0 // indirect - golang.org/x/net v0.51.0 // indirect + golang.org/x/net v0.52.0 // indirect golang.org/x/sync v0.20.0 // indirect golang.org/x/sys v0.42.0 // indirect golang.org/x/text v0.35.0 // indirect golang.org/x/tools v0.42.0 // indirect - google.golang.org/protobuf v1.36.9 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 // indirect + google.golang.org/grpc v1.80.0 // indirect + google.golang.org/protobuf v1.36.11 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/veza-backend-api/go.sum b/veza-backend-api/go.sum index 138c184b3..6bbcc7a72 100644 --- 
a/veza-backend-api/go.sum +++ b/veza-backend-api/go.sum @@ -1,5 +1,5 @@ -cloud.google.com/go/compute/metadata v0.3.0 h1:Tz+eQXMEqDIKRsmY3cHTL6FVaynIjX2QxYC4trgAKZc= -cloud.google.com/go/compute/metadata v0.3.0/go.mod h1:zFmK7XCadkQkj6TtorcaGlCW1hT1fIilQDwofLpJ20k= +cloud.google.com/go/compute/metadata v0.9.0 h1:pDUj4QMoPejqq20dK0Pg2N4yG9zIkYGdBtwLoEkH9Zs= +cloud.google.com/go/compute/metadata v0.9.0/go.mod h1:E0bWwX5wTnLPedCKqk3pJmVgCBSM6qQI1yTBdEb3C10= dario.cat/mergo v1.0.2 h1:85+piFYR1tMbRrLcDwR18y4UKJ3aH1Tbzi24VRW1TK8= dario.cat/mergo v1.0.2/go.mod h1:E/hbnu0NxMFBjpMIE34DRGLWqDy0g5FuKDhCb31ngxA= github.com/AdaLogics/go-fuzz-headers v0.0.0-20240806141605-e8a1dd7889d6 h1:He8afgbRMd7mFxO99hRNu+6tazq8nFF9lIwo9JFroBk= @@ -75,6 +75,8 @@ github.com/bytedance/sonic/loader v0.3.0 h1:dskwH8edlzNMctoruo8FPTJDF3vLtDT0sXZw github.com/bytedance/sonic/loader v0.3.0/go.mod h1:N8A3vUdtUebEY2/VQC0MyhYeKUFosQU6FxH2JmUe6VI= github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= +github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM= +github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cloudwego/base64x v0.1.6 h1:t11wG9AECkCDk5fMSoxmufanudBtJ+/HemLstXDLI2M= @@ -167,6 +169,8 @@ github.com/goccy/go-yaml v1.18.0/go.mod h1:XBurs7gK8ATbW4ZPGKgcbrY1Br56PdM69F7Lk github.com/golang-jwt/jwt/v5 v5.2.1/go.mod h1:pqrtFR0X4osieyHYxtmOUWsAWrfe1Q5UVIyoH402zdk= github.com/golang-jwt/jwt/v5 v5.3.0 h1:pv4AsKCKKZuqlgs5sUmn4x8UlGa0kEVt/puTpKx9vvo= github.com/golang-jwt/jwt/v5 v5.3.0/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= 
+github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= @@ -174,6 +178,8 @@ github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 h1:HWRh5R2+9EifMyIHV7ZV+MIZqgz+PMpZ14Jynv3O2Zs= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0/go.mod h1:JfhWUomR1baixubs02l85lZYYOm7LV6om4ceouMv45c= github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM= github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg= github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 h1:iCEnooe7UlwOQYpKFhBabPMi4aNAfoODPEFNiAnClxo= @@ -335,16 +341,22 @@ go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 h1:sbiXRNDSWJOTobXh5HyQKjq6wUC5tNybqjIqDpAY4CU= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0/go.mod h1:69uWxva0WgAA/4bu2Yy70SLDBwZXuQ6PbBpbsa5iZrQ= -go.opentelemetry.io/otel v1.41.0 h1:YlEwVsGAlCvczDILpUXpIpPSL/VPugt7zHThEMLce1c= -go.opentelemetry.io/otel v1.41.0/go.mod h1:Yt4UwgEKeT05QbLwbyHXEwhnjxNO6D8L5PQP51/46dE= -go.opentelemetry.io/otel/metric v1.41.0 h1:rFnDcs4gRzBcsO9tS8LCpgR0dxg4aaxWlJxCno7JlTQ= -go.opentelemetry.io/otel/metric v1.41.0/go.mod h1:xPvCwd9pU0VN8tPZYzDZV/BMj9CM9vs00GuBjeKhJps= 
-go.opentelemetry.io/otel/sdk v1.35.0 h1:iPctf8iprVySXSKJffSS79eOjl9pvxV9ZqOWT0QejKY= -go.opentelemetry.io/otel/sdk v1.35.0/go.mod h1:+ga1bZliga3DxJ3CQGg3updiaAJoNECOgJREo9KHGQg= -go.opentelemetry.io/otel/sdk/metric v1.35.0 h1:1RriWBmCKgkeHEhM7a2uMjMUfP7MsOF5JpUCaEqEI9o= -go.opentelemetry.io/otel/sdk/metric v1.35.0/go.mod h1:is6XYCUMpcKi+ZsOvfluY5YstFnhW0BidkR+gL+qN+w= -go.opentelemetry.io/otel/trace v1.41.0 h1:Vbk2co6bhj8L59ZJ6/xFTskY+tGAbOnCtQGVVa9TIN0= -go.opentelemetry.io/otel/trace v1.41.0/go.mod h1:U1NU4ULCoxeDKc09yCWdWe+3QoyweJcISEVa1RBzOis= +go.opentelemetry.io/otel v1.43.0 h1:mYIM03dnh5zfN7HautFE4ieIig9amkNANT+xcVxAj9I= +go.opentelemetry.io/otel v1.43.0/go.mod h1:JuG+u74mvjvcm8vj8pI5XiHy1zDeoCS2LB1spIq7Ay0= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 h1:88Y4s2C8oTui1LGM6bTWkw0ICGcOLCAI5l6zsD1j20k= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0/go.mod h1:Vl1/iaggsuRlrHf/hfPJPvVag77kKyvrLeD10kpMl+A= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0 h1:RAE+JPfvEmvy+0LzyUA25/SGawPwIUbZ6u0Wug54sLc= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0/go.mod h1:AGmbycVGEsRx9mXMZ75CsOyhSP6MFIcj/6dnG+vhVjk= +go.opentelemetry.io/otel/metric v1.43.0 h1:d7638QeInOnuwOONPp4JAOGfbCEpYb+K6DVWvdxGzgM= +go.opentelemetry.io/otel/metric v1.43.0/go.mod h1:RDnPtIxvqlgO8GRW18W6Z/4P462ldprJtfxHxyKd2PY= +go.opentelemetry.io/otel/sdk v1.43.0 h1:pi5mE86i5rTeLXqoF/hhiBtUNcrAGHLKQdhg4h4V9Dg= +go.opentelemetry.io/otel/sdk v1.43.0/go.mod h1:P+IkVU3iWukmiit/Yf9AWvpyRDlUeBaRg6Y+C58QHzg= +go.opentelemetry.io/otel/sdk/metric v1.43.0 h1:S88dyqXjJkuBNLeMcVPRFXpRw2fuwdvfCGLEo89fDkw= +go.opentelemetry.io/otel/sdk/metric v1.43.0/go.mod h1:C/RJtwSEJ5hzTiUz5pXF1kILHStzb9zFlIEe85bhj6A= +go.opentelemetry.io/otel/trace v1.43.0 h1:BkNrHpup+4k4w+ZZ86CZoHHEkohws8AY+WTX09nk+3A= +go.opentelemetry.io/otel/trace v1.43.0/go.mod h1:/QJhyVBUUswCphDVxq+8mld+AvhXZLhe+8WVFxiFff0= +go.opentelemetry.io/proto/otlp v1.10.0 
h1:IQRWgT5srOCYfiWnpqUYz9CVmbO8bFmKcwYxpuCSL2g= +go.opentelemetry.io/proto/otlp v1.10.0/go.mod h1:/CV4QoCR/S9yaPj8utp3lvQPoqMtxXdzn7ozvvozVqk= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/mock v0.5.2 h1:LbtPTcP8A5k9WPXj54PPPbjcI4Y6lhyOZXn+VS7wNko= @@ -361,8 +373,8 @@ golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliY golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= -golang.org/x/crypto v0.48.0 h1:/VRzVqiRSggnhY7gNRxPauEQ5Drw9haKdM0jqfcCFts= -golang.org/x/crypto v0.48.0/go.mod h1:r0kV5h3qnFPlQnBSrULhlsRfryS2pmewsg+XfMgkVos= +golang.org/x/crypto v0.49.0 h1:+Ng2ULVvLHnJ/ZFEq4KdcDd/cfjrrjjNSXNzxg0Y4U4= +golang.org/x/crypto v0.49.0/go.mod h1:ErX4dUh2UM+CFYiXZRTcMpEcN8b/1gxEuv3nODoYtCA= golang.org/x/image v0.0.0-20191009234506-e7c1f5e7dbb8/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= golang.org/x/image v0.38.0 h1:5l+q+Y9JDC7mBOMjo4/aPhMDcxEptsX+Tt3GgRQRPuE= golang.org/x/image v0.38.0/go.mod h1:/3f6vaXC+6CEanU4KJxbcUZyEePbyKbaLoDOe4ehFYY= @@ -383,10 +395,10 @@ golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= -golang.org/x/net v0.51.0 h1:94R/GTO7mt3/4wIKpcR5gkGmRLOuE/2hNGeWq/GBIFo= -golang.org/x/net v0.51.0/go.mod h1:aamm+2QF5ogm02fjy5Bb7CQ0WMt1/WVM7FtyaTLlA9Y= -golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI= -golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU= +golang.org/x/net v0.52.0 
h1:He/TN1l0e4mmR3QqHMT2Xab3Aj3L9qjbhRm78/6jrW0= +golang.org/x/net v0.52.0/go.mod h1:R1MAz7uMZxVMualyPXb+VaqGSa3LIaUqk0eEt3w36Sw= +golang.org/x/oauth2 v0.35.0 h1:Mv2mzuHuZuY2+bkyWXIHMfhNdJAdwW3FuWeCPYN5GVQ= +golang.org/x/oauth2 v0.35.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -423,8 +435,8 @@ golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU= golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM= -golang.org/x/term v0.40.0 h1:36e4zGLqU4yhjlmxEaagx2KuYbJq3EwY8K943ZsHcvg= -golang.org/x/term v0.40.0/go.mod h1:w2P8uVp06p2iyKKuvXIm7N/y0UCRt3UfJTfZ7oOpglM= +golang.org/x/term v0.41.0 h1:QCgPso/Q3RTJx2Th4bDLqML4W6iJiaXFq2/ftQF13YU= +golang.org/x/term v0.41.0/go.mod h1:3pfBgksrReYfZ5lvYM0kSO0LIkAl4Yl2bXOkKP7Ec2A= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= @@ -449,8 +461,16 @@ golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k= golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -google.golang.org/protobuf v1.36.9 h1:w2gp2mA27hUeUzj9Ex9FBjsBm40zfaDtEWow293U7Iw= -google.golang.org/protobuf 
v1.36.9/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= +gonum.org/v1/gonum v0.17.0 h1:VbpOemQlsSMrYmn7T2OUvQ4dqxQXU+ouZFQsZOx50z4= +gonum.org/v1/gonum v0.17.0/go.mod h1:El3tOrEuMpv2UdMrbNlKEh9vd86bmQ6vqIcDwxEOc1E= +google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 h1:VPWxll4HlMw1Vs/qXtN7BvhZqsS9cdAittCNvVENElA= +google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:7QBABkRtR8z+TEnmXTqIqwJLlzrZKVfAUm7tY3yGv0M= +google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 h1:m8qni9SQFH0tJc1X0vmnpw/0t+AImlSvp30sEupozUg= +google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:4Hqkh8ycfw05ld/3BWL7rJOSfebL2Q+DVDeRgYgxUU8= +google.golang.org/grpc v1.80.0 h1:Xr6m2WmWZLETvUNvIUmeD5OAagMw3FiKmMlTdViWsHM= +google.golang.org/grpc v1.80.0/go.mod h1:ho/dLnxwi3EDJA4Zghp7k2Ec1+c2jqup0bFkw07bwF4= +google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= +google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= diff --git a/veza-backend-api/internal/core/marketplace/service.go b/veza-backend-api/internal/core/marketplace/service.go index 52d2f3695..922bf9f4e 100644 --- a/veza-backend-api/internal/core/marketplace/service.go +++ b/veza-backend-api/internal/core/marketplace/service.go @@ -9,12 +9,16 @@ import ( "time" "github.com/google/uuid" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" "go.uber.org/zap" "gorm.io/gorm" "veza-backend-api/internal/core/connecterrors" "veza-backend-api/internal/models" 
"veza-backend-api/internal/monitoring" + "veza-backend-api/internal/tracing" ) var ( @@ -748,16 +752,30 @@ func (wp *HyperswitchWebhookPayload) IsRefundEvent() bool { // ProcessPaymentWebhook handles Hyperswitch payment webhook. // Updates order status and creates licenses when status is "succeeded". func (s *Service) ProcessPaymentWebhook(ctx context.Context, payload []byte) error { + // v1.0.9 Day 9 — payment.webhook span. Hot path on every Hyperswitch + // callback. Records payment_id (the carrier id Hyperswitch uses, not a + // secret) + status so trace search can pivot on a single payment quickly. + ctx, span := otel.Tracer(tracing.TracerName).Start(ctx, "payment.webhook") + defer span.End() + var wp HyperswitchWebhookPayload if err := json.Unmarshal(payload, &wp); err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, "invalid webhook payload") s.logger.Error("Invalid Hyperswitch webhook payload", zap.Error(err), zap.ByteString("payload", payload)) return fmt.Errorf("invalid webhook payload: %w", err) } paymentID := wp.getPaymentID() if paymentID == "" { + span.SetStatus(codes.Error, "missing payment_id") return fmt.Errorf("webhook payload missing payment_id") } status := wp.getStatus() + span.SetAttributes( + attribute.String("payment.id", paymentID), + attribute.String("payment.status", status), + attribute.String("payment.event_type", wp.EventType), + ) // v1.0.9 item G Phase 2: subscription dispatcher. 
Try the subscription // flow first; if the payment_id maps to a subscription invoice, the diff --git a/veza-backend-api/internal/core/track/track_upload_handler.go b/veza-backend-api/internal/core/track/track_upload_handler.go index 44ee2530f..6cf78c62b 100644 --- a/veza-backend-api/internal/core/track/track_upload_handler.go +++ b/veza-backend-api/internal/core/track/track_upload_handler.go @@ -17,8 +17,13 @@ import ( "veza-backend-api/internal/handlers" "veza-backend-api/internal/models" "veza-backend-api/internal/response" + "veza-backend-api/internal/tracing" "github.com/gin-gonic/gin" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" "go.uber.org/zap" ) @@ -247,14 +252,30 @@ func (h *TrackHandler) InitiateChunkedUpload(c *gin.Context) { return // Erreur déjà envoyée au client } + // v1.0.9 Day 9 — track.upload.initiate span. Hot path on upload kickoff. + // chunkService doesn't accept ctx today; the span only wraps the handler + // invocation, not the S3 multipart create itself. Migrating chunkService + // to take ctx is tracked separately. 
+ _, span := otel.Tracer(tracing.TracerName).Start(c.Request.Context(), "track.upload.initiate", + trace.WithAttributes( + attribute.String("track.upload.user_id", userID.String()), + attribute.Int("track.upload.total_chunks", req.TotalChunks), + attribute.Int64("track.upload.total_size", req.TotalSize), + ), + ) + defer span.End() + // Initialiser l'upload // InitiateChunkedUpload retourne un string (uploadID) donc pas de souci d'int64 // Note: InitiateChunkedUpload n'accepte pas de context (à migrer si nécessaire) uploadID, err := h.chunkService.InitiateChunkedUpload(userID, req.TotalChunks, req.TotalSize, req.Filename) if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, "chunk upload init failed") response.InternalServerError(c, err.Error()) return } + span.SetAttributes(attribute.String("track.upload.id", uploadID)) response.Success(c, gin.H{ "upload_id": uploadID, diff --git a/veza-backend-api/internal/handlers/auth.go b/veza-backend-api/internal/handlers/auth.go index ea5b8feba..3e9c6d911 100644 --- a/veza-backend-api/internal/handlers/auth.go +++ b/veza-backend-api/internal/handlers/auth.go @@ -13,9 +13,14 @@ import ( // "veza-backend-api/internal/response" // Removed this import "veza-backend-api/internal/services" + "veza-backend-api/internal/tracing" "github.com/gin-gonic/gin" "github.com/google/uuid" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" "go.uber.org/zap" ) @@ -64,10 +69,10 @@ func Login(authService *auth.AuthService, sessionService *services.SessionServic // req.RememberMe is a bool, not *bool, so no need to check for nil or indirect rememberMe := req.RememberMe + // SECURITY(MEDIUM-011): Mask email in logs/spans to prevent PII leakage. + maskedEmail := maskEmail(req.Email) if logger != nil { - // SECURITY(MEDIUM-011): Mask email in logs to prevent PII leakage. 
- maskedEmail := maskEmail(req.Email) logger.Info("Login handler processing request", zap.String("email", maskedEmail), zap.Bool("remember_me", rememberMe), @@ -77,8 +82,22 @@ func Login(authService *auth.AuthService, sessionService *services.SessionServic // MOD-P1-004: Ajouter timeout context pour opération DB critique (login) ctx, cancel := WithTimeout(c.Request.Context(), 5*time.Second) defer cancel() + + // v1.0.9 Day 9 — auth.login span. Hot path: every login request goes + // through here. Email is masked, no password attribute. Failure paths + // below set the span status to error. + ctx, span := otel.Tracer(tracing.TracerName).Start(ctx, "auth.login", + trace.WithAttributes( + attribute.String("auth.email_masked", maskedEmail), + attribute.Bool("auth.remember_me", rememberMe), + ), + ) + defer span.End() + user, tokens, err := authService.Login(ctx, req.Email, req.Password, rememberMe) if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, "login failed") // MOD-P1-002: Improved error handling errMsg := err.Error() diff --git a/veza-backend-api/internal/handlers/search_handlers.go b/veza-backend-api/internal/handlers/search_handlers.go index fd0c1e6b6..5a93a73bc 100644 --- a/veza-backend-api/internal/handlers/search_handlers.go +++ b/veza-backend-api/internal/handlers/search_handlers.go @@ -6,8 +6,13 @@ import ( apperrors "veza-backend-api/internal/errors" "veza-backend-api/internal/services" + "veza-backend-api/internal/tracing" "github.com/gin-gonic/gin" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" ) var SearchHandlersInstance *SearchHandlers @@ -59,8 +64,21 @@ func (sh *SearchHandlers) Search(c *gin.Context) { types := c.QueryArray("type") + // v1.0.9 Day 9 — search.query span. Hot path: every search bar press + // hits this. Query content is NOT recorded (PII / search history is + // sensitive); only length + types so cardinality stays bounded. 
+ _, span := otel.Tracer(tracing.TracerName).Start(c.Request.Context(), "search.query", + trace.WithAttributes( + attribute.Int("search.query_length", len(query)), + attribute.StringSlice("search.types", types), + ), + ) + defer span.End() + results, err := sh.searchService.Search(query, types) if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, "search failed") RespondWithAppError(c, apperrors.NewInternalErrorWrap("Search failed", err)) return } diff --git a/veza-backend-api/internal/tracing/otlp_exporter.go b/veza-backend-api/internal/tracing/otlp_exporter.go new file mode 100644 index 000000000..207355af9 --- /dev/null +++ b/veza-backend-api/internal/tracing/otlp_exporter.go @@ -0,0 +1,194 @@ +// Package tracing exposes the OpenTelemetry tracer provider wiring +// for veza-backend-api. v1.0.9 Day 9 — replaces the in-house +// W3C-only TraceContext (still kept for header propagation) with a +// real OTel SDK + OTLP/gRPC exporter that ships spans to the +// otel-collector container in front of Tempo. +// +// Wiring at runtime: +// +// veza-backend-api ──OTLP/gRPC:4317──▶ otel-collector ──▶ Tempo (Grafana stack) +// +// The collector + Tempo are provisioned by infra/ansible/roles/ +// otel_collector + roles/tempo (W2 Day 9). +package tracing + +import ( + "context" + "errors" + "fmt" + "os" + "time" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/exporters/otlp/otlptrace" + "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" + "go.opentelemetry.io/otel/propagation" + "go.opentelemetry.io/otel/sdk/resource" + sdktrace "go.opentelemetry.io/otel/sdk/trace" + semconv "go.opentelemetry.io/otel/semconv/v1.26.0" + "go.uber.org/zap" +) + +// TracerName is the global instrumentation library identifier used +// by every veza-backend-api span. Hot paths grab a tracer with this +// name — keep it stable so dashboards filter cleanly by +// `service.name = veza-backend-api` AND `instrumentation.library.name +// = veza-backend-api`. 
+const TracerName = "veza-backend-api" + +// Provider holds the SDK tracer provider so the caller can shut it +// down on app exit (flushes the buffered span queue to OTLP, prevents +// trace loss on a graceful Ctrl-C). +type Provider struct { + provider *sdktrace.TracerProvider + exporter *otlptrace.Exporter + logger *zap.Logger +} + +// InitOTLPTracer initialises the global OTel tracer provider with an +// OTLP/gRPC exporter pointed at OTEL_EXPORTER_OTLP_ENDPOINT +// (default: localhost:4317, which the otel_collector role binds). +// +// Behaviour matrix: +// - OTEL_EXPORTER_OTLP_ENDPOINT unset + OTEL_SDK_DISABLED unset → +// try localhost:4317. If the dial fails (collector down), the +// exporter buffers and retries; the app keeps running. Spans are +// dropped after the buffer fills (default 2048 spans), but no +// hot-path code blocks on the exporter. +// - OTEL_SDK_DISABLED=true → returns a no-op Provider (zero spans +// emitted). Used in unit tests and dev mode where the operator +// doesn't want background networking. +// +// Caller MUST `defer p.Shutdown(ctx)` so the in-flight queue is +// flushed on exit. The returned Provider's `Shutdown` is safe to +// call multiple times. 
+func InitOTLPTracer(ctx context.Context, env, version string, logger *zap.Logger) (*Provider, error) { + if logger == nil { + logger = zap.NewNop() + } + + if isOTelDisabled() { + logger.Info("OTel tracer init skipped (OTEL_SDK_DISABLED=true)") + return &Provider{logger: logger}, nil + } + + endpoint := otelEndpoint() + logger.Info("Initialising OTel tracer", + zap.String("endpoint", endpoint), + zap.String("service", TracerName), + zap.String("env", env), + zap.String("version", version)) + + exporter, err := otlptracegrpc.New(ctx, + otlptracegrpc.WithEndpoint(endpoint), + otlptracegrpc.WithInsecure(), // collector runs on the trusted Incus bridge; mTLS is W4 territory + otlptracegrpc.WithTimeout(5*time.Second), + ) + if err != nil { + return nil, fmt.Errorf("create OTLP exporter: %w", err) + } + + res, err := resource.Merge( + resource.Default(), + resource.NewWithAttributes( + semconv.SchemaURL, + semconv.ServiceName(TracerName), + semconv.ServiceVersion(version), + semconv.DeploymentEnvironment(env), + ), + ) + if err != nil { + return nil, fmt.Errorf("build OTel resource: %w", err) + } + + // BatchSpanProcessor — not the simple/sync processor. + // BatchSpanProcessor: + // - buffers spans up to 2048 by default + // - flushes every 5s OR when full + // - never blocks the hot-path on collector availability + // Sync would block every span on a network round-trip; that's a + // regression on every endpoint we instrument. The trade is + // possible span loss when the buffer fills — acceptable because + // we instrument hot paths sparingly + the collector is + // co-located on the same machine in v1.0. 
+ provider := sdktrace.NewTracerProvider( + sdktrace.WithBatcher(exporter, + sdktrace.WithBatchTimeout(5*time.Second), + sdktrace.WithMaxExportBatchSize(512), + ), + sdktrace.WithResource(res), + sdktrace.WithSampler(sdktrace.ParentBased(sdktrace.TraceIDRatioBased(sampleRatio()))), + ) + + otel.SetTracerProvider(provider) + otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator( + propagation.TraceContext{}, + propagation.Baggage{}, + )) + + return &Provider{ + provider: provider, + exporter: exporter, + logger: logger, + }, nil +} + +// Shutdown flushes pending spans to the collector. Pass a context +// with a deadline (typically 5-10s during graceful shutdown) so a +// dead collector doesn't block app exit. +func (p *Provider) Shutdown(ctx context.Context) error { + if p == nil || p.provider == nil { + return nil + } + var err error + if shutdownErr := p.provider.Shutdown(ctx); shutdownErr != nil { + err = errors.Join(err, fmt.Errorf("shutdown tracer provider: %w", shutdownErr)) + } + if p.exporter != nil { + if shutdownErr := p.exporter.Shutdown(ctx); shutdownErr != nil { + err = errors.Join(err, fmt.Errorf("shutdown OTLP exporter: %w", shutdownErr)) + } + } + if err != nil { + p.logger.Warn("OTel shutdown produced errors", zap.Error(err)) + } + return err +} + +func otelEndpoint() string { + if v := os.Getenv("OTEL_EXPORTER_OTLP_ENDPOINT"); v != "" { + return v + } + return "localhost:4317" +} + +func isOTelDisabled() bool { + v := os.Getenv("OTEL_SDK_DISABLED") + return v == "true" || v == "1" +} + +// sampleRatio reads OTEL_TRACES_SAMPLER_ARG (a float 0..1) and +// returns the fraction of root traces that should ship to the +// collector. Default is 1.0 in dev/staging (all traces) and 0.1 in +// prod (10%) to keep the Tempo backend lean. Callers can flip via +// the env var without re-deploy. 
func sampleRatio() float64 {
	v := os.Getenv("OTEL_TRACES_SAMPLER_ARG")
	if v == "" {
		// Unset → sample everything. Prod operators set the env var
		// (e.g. "0.1") to cut volume; the code itself applies no
		// env-specific default. (Previous comment claimed an
		// OTEL_DEPLOYMENT_ENV-aware default that was never implemented.)
		return 1.0
	}
	var ratio float64
	if _, err := fmt.Sscanf(v, "%f", &ratio); err != nil {
		// Unparseable value → fail open to full sampling rather than
		// silently dropping every trace.
		return 1.0
	}
	// FIX: "NaN" parses successfully via %f but fails both clamps
	// below (NaN compares false to everything), so it used to reach
	// TraceIDRatioBased as-is. Treat it as misconfiguration and fail
	// open. (x != x is true only for NaN.)
	if ratio != ratio {
		return 1.0
	}
	// Clamp into the valid probability range [0, 1].
	if ratio < 0 {
		return 0
	}
	if ratio > 1 {
		return 1
	}
	return ratio
}