feat(observability): OTel SDK + collector + Tempo + 4 hot path spans (W2 Day 9)
Some checks failed
Veza CI / Notify on failure (push) Blocked by required conditions
Security Scan / Secret Scanning (gitleaks) (push) Waiting to run
Veza CI / Backend (Go) (push) Has been cancelled
Veza CI / Rust (Stream Server) (push) Has been cancelled
Veza CI / Frontend (Web) (push) Has been cancelled
E2E Playwright / e2e (full) (push) Has been cancelled

Wires distributed tracing end-to-end. Backend exports OTLP/gRPC to a
collector, which tail-samples (errors + slow always, 10% rest) and
ships to Tempo. Grafana service-map dashboard pivots on the 4
instrumented hot paths.

- internal/tracing/otlp_exporter.go : InitOTLPTracer + Provider.Shutdown,
  BatchSpanProcessor (5s/512 batch), ParentBased(TraceIDRatio) sampler,
  W3C trace-context + baggage propagators. OTEL_SDK_DISABLED=true
  short-circuits to a no-op. Failure to dial collector is non-fatal.
- cmd/api/main.go : init at boot, defer Shutdown(5s) on exit. appVersion
  ldflag-overridable for resource attributes.
- 4 hot paths instrumented :
    * handlers/auth.go::Login           → "auth.login"
    * core/track/track_upload_handler.go::InitiateChunkedUpload → "track.upload.initiate"
    * core/marketplace/service.go::ProcessPaymentWebhook → "payment.webhook"
    * handlers/search_handlers.go::Search → "search.query"
  PII guarded — email masked, query content not recorded (length only).
- infra/ansible/roles/otel_collector : pin v0.116.1 contrib build,
  systemd unit, tail-sampling config (errors + > 500ms always kept).
- infra/ansible/roles/tempo : pin v2.7.1 monolithic, local-disk backend
  (S3 deferred to v1.1), 14d retention.
- infra/ansible/playbooks/observability.yml : provisions both Incus
  containers + applies common baseline + roles in order.
- inventory/lab.yml : new groups observability, otel_collectors, tempo.
- config/grafana/dashboards/service-map.json : node graph + 4 hot-path
  span tables + collector throughput/queue panels.
- docs/ENV_VARIABLES.md §30 : 4 OTEL_* env vars documented.

Acceptance criterion (Day 9) : login → span visible in Tempo UI. Lab
deployment to validate with `ansible-playbook -i inventory/lab.yml
playbooks/observability.yml` once roles/postgres_ha is up.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
senke 2026-04-28 01:15:11 +02:00
parent bf31a91ae6
commit 84e92a75e2
24 changed files with 1139 additions and 34 deletions

View file

@ -0,0 +1,101 @@
{
"annotations": { "list": [] },
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": null,
"links": [
{ "title": "Tempo data source", "type": "link", "url": "/explore?left=%7B%22datasource%22:%22tempo%22%7D" }
],
"liveNow": false,
"panels": [
{
"datasource": { "type": "tempo", "uid": "tempo" },
"gridPos": { "h": 14, "w": 24, "x": 0, "y": 0 },
"id": 1,
"type": "nodeGraph",
"title": "Service map (last 1h)",
"options": {},
"targets": [
{
"queryType": "serviceMap",
"refId": "A",
"datasource": { "type": "tempo", "uid": "tempo" }
}
]
},
{
"datasource": { "type": "tempo", "uid": "tempo" },
"gridPos": { "h": 10, "w": 12, "x": 0, "y": 14 },
"id": 2,
"type": "table",
"title": "Slowest spans (auth.login + track.upload.initiate + payment.webhook + search.query)",
"options": { "showHeader": true },
"fieldConfig": { "defaults": { "custom": { "align": "auto" } }, "overrides": [] },
"targets": [
{
"query": "{name=~\"auth.login|track.upload.initiate|payment.webhook|search.query\"} | by(name) | aggregate(max(duration))",
"queryType": "traceql",
"tableType": "spans",
"refId": "A",
"datasource": { "type": "tempo", "uid": "tempo" }
}
]
},
{
"datasource": { "type": "tempo", "uid": "tempo" },
"gridPos": { "h": 10, "w": 12, "x": 12, "y": 14 },
"id": 3,
"type": "table",
"title": "Recent errors on hot paths",
"options": { "showHeader": true },
"fieldConfig": { "defaults": { "custom": { "align": "auto" } }, "overrides": [] },
"targets": [
{
"query": "{name=~\"auth.login|track.upload.initiate|payment.webhook|search.query\" && status=error} | by(name)",
"queryType": "traceql",
"tableType": "spans",
"refId": "A",
"datasource": { "type": "tempo", "uid": "tempo" }
}
]
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 },
"id": 4,
"type": "timeseries",
"title": "OTel collector — accepted vs refused spans",
"fieldConfig": { "defaults": { "unit": "short", "color": { "mode": "palette-classic" } }, "overrides": [] },
"options": { "legend": { "displayMode": "list", "placement": "bottom" } },
"targets": [
{ "expr": "sum(rate(otelcol_receiver_accepted_spans{receiver=\"otlp\"}[5m]))", "legendFormat": "accepted", "refId": "A" },
{ "expr": "sum(rate(otelcol_receiver_refused_spans{receiver=\"otlp\"}[5m]))", "legendFormat": "refused", "refId": "B" }
]
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 },
"id": 5,
"type": "timeseries",
"title": "OTel collector — exporter queue depth",
"fieldConfig": { "defaults": { "unit": "short", "color": { "mode": "palette-classic" } }, "overrides": [] },
"options": { "legend": { "displayMode": "list", "placement": "bottom" } },
"targets": [
{ "expr": "otelcol_exporter_queue_size", "legendFormat": "{{exporter}}", "refId": "A" },
{ "expr": "otelcol_exporter_queue_capacity", "legendFormat": "{{exporter}} capacity", "refId": "B" }
]
}
],
"refresh": "30s",
"schemaVersion": 38,
"style": "dark",
"tags": ["veza", "tracing", "tempo"],
"templating": { "list": [] },
"time": { "from": "now-1h", "to": "now" },
"timepicker": {},
"timezone": "browser",
"title": "Veza Service Map (Tempo)",
"uid": "veza-service-map",
"version": 1
}

View file

@ -56,7 +56,8 @@ Tout le reste a un défaut raisonnable ou est opt-in.
27. [Variables dépréciées / legacy](#27-variables-dépréciées--legacy) 27. [Variables dépréciées / legacy](#27-variables-dépréciées--legacy)
28. [Règles de validation production](#28-règles-de-validation-production) 28. [Règles de validation production](#28-règles-de-validation-production)
29. [Drift template ↔ code](#29-drift-template--code) 29. [Drift template ↔ code](#29-drift-template--code)
30. [Checklist de démarrage](#30-checklist-de-démarrage) 30. [OpenTelemetry / distributed tracing](#30-opentelemetry--distributed-tracing-v109-day-9)
31. [Checklist de démarrage](#31-checklist-de-démarrage)
**Légende** : **variable en gras** = critique en production (validée au boot). **Légende** : **variable en gras** = critique en production (validée au boot).
@ -543,7 +544,22 @@ Survey 2026-04-23 a identifié des incohérences entre `.env.template` et le cod
**Incohérence de nommage** : `SMTP_USERNAME` canonique vs `SMTP_USER` legacy ; `DB_MAX_*` code vs `DATABASE_MAX_*` template. **Incohérence de nommage** : `SMTP_USERNAME` canonique vs `SMTP_USER` legacy ; `DB_MAX_*` code vs `DATABASE_MAX_*` template.
## 30. Checklist de démarrage ## 30. OpenTelemetry / distributed tracing (v1.0.9 Day 9)
Quatre variables consommées par `veza-backend-api/internal/tracing/otlp_exporter.go` au boot. Toutes optionnelles — non set = comportement par défaut documenté.
| Variable | Défaut | Effet |
| --- | --- | --- |
| `OTEL_EXPORTER_OTLP_ENDPOINT` | `localhost:4317` | gRPC endpoint de l'otel-collector. En prod : `otel-collector.lxd:4317`. |
| `OTEL_SDK_DISABLED` | `false` | `true` ou `1` → no-op tracer (zero spans émis). Utile en tests unitaires + dev local sans collector. |
| `OTEL_TRACES_SAMPLER_ARG` | `1.0` | Fraction de traces root samplées côté SDK (0..1). Prod recommandé `1.0` puisque le collector applique son propre tail-sampling derrière. |
| `OTEL_DEPLOYMENT_ENV` | (none) | Override de `cfg.Env` pour le `deployment.environment` resource attribute. Rarement utile. |
Le binaire **ne crashe pas** si le collector est down : l'exporter bufferise puis réessaie. Les spans sont droppés au-delà de 2048 en buffer.
Hot paths instrumentés (v1.0.9) : `auth.login`, `track.upload.initiate`, `payment.webhook`, `search.query`. Voir `infra/ansible/roles/{otel_collector,tempo}/README.md` pour le déploiement de la pipeline.
## 31. Checklist de démarrage
1. Copier `veza-backend-api/.env.template` vers `veza-backend-api/.env` et configurer. 1. Copier `veza-backend-api/.env.template` vers `veza-backend-api/.env` et configurer.
2. Pour RS256 prod : exécuter `scripts/generate-jwt-keys.sh` et configurer `JWT_PRIVATE_KEY_PATH`, `JWT_PUBLIC_KEY_PATH`. Sinon `JWT_SECRET` ≥32 chars. 2. Pour RS256 prod : exécuter `scripts/generate-jwt-keys.sh` et configurer `JWT_PRIVATE_KEY_PATH`, `JWT_PUBLIC_KEY_PATH`. Sinon `JWT_SECRET` ≥32 chars.

View file

@ -56,3 +56,19 @@ all:
vars: vars:
ansible_connection: community.general.incus ansible_connection: community.general.incus
ansible_python_interpreter: /usr/bin/python3 ansible_python_interpreter: /usr/bin/python3
# v1.0.9 Day 9: otel-collector + Tempo for distributed tracing.
# Each runs in its own Incus container; the API on the host points
# at otel-collector.lxd:4317 via OTEL_EXPORTER_OTLP_ENDPOINT.
observability:
  hosts:
    otel-collector:
    tempo:
  vars:
    ansible_connection: community.general.incus
    ansible_python_interpreter: /usr/bin/python3
otel_collectors:
  hosts:
    otel-collector:
# NOTE(review): the group `tempo` and the host `tempo` share a name.
# Ansible resolves patterns host-first, which happens to work here, but
# it is easy to trip over in `--limit` expressions — consider renaming
# the group (e.g. `tempo_servers`).
tempo:
  hosts:
    tempo:

View file

@ -0,0 +1,71 @@
# Observability playbook — provisions Incus containers for the
# trace pipeline and lays down otel-collector + tempo on top.
#
# Topology:
#   otel-collector — receives OTLP from veza-backend-api, samples + ships
#   tempo          — stores traces, queried by Grafana
#
# Both run on the same Incus host today (veza-lab). When v1.1 splits
# observability onto a dedicated host, the only change here is the
# inventory; the playbook stays the same.
#
# Run with:
#   ansible-galaxy collection install community.general
#   ansible-playbook -i inventory/lab.yml playbooks/observability.yml --check
#   ansible-playbook -i inventory/lab.yml playbooks/observability.yml
---
- name: Provision Incus containers for the trace pipeline
  hosts: incus_hosts
  become: true
  gather_facts: true
  tasks:
    - name: Launch otel-collector + tempo
      ansible.builtin.shell:
        cmd: |
          set -e
          for ct in otel-collector tempo; do
            if ! incus info "$ct" >/dev/null 2>&1; then
              incus launch images:ubuntu/22.04 "$ct"
              # Explicit marker consumed by changed_when below. `incus
              # launch` itself prints "Launching <name>" — the literal
              # command text never appears in stdout.
              echo "LAUNCHED:$ct"
              # Wait for cloud-init so apt works inside the container.
              for _ in $(seq 1 30); do
                if incus exec "$ct" -- cloud-init status 2>/dev/null | grep -q "status: done"; then
                  break
                fi
                sleep 1
              done
              incus exec "$ct" -- apt-get update
              incus exec "$ct" -- apt-get install -y python3 python3-apt
            fi
          done
        executable: /bin/bash
      register: provision_result
      # BUG FIX: the previous test looked for the literal string
      # "incus launch" in stdout, which never matches (the CLI echoes
      # "Launching …"), so the task always reported "ok" even when it
      # had just created both containers. Match the echoed marker.
      changed_when: "'LAUNCHED:' in provision_result.stdout"
      tags: [observability, provision]

    - name: Refresh inventory so the new containers are reachable
      ansible.builtin.meta: refresh_inventory

- name: Apply common baseline to observability containers
  hosts: observability
  become: true
  gather_facts: true
  roles:
    - common

# Tempo first — the collector depends on it being reachable. Ansible
# runs roles in declaration order, but we put them on separate plays
# anyway because the collector needs Tempo's port open to fully start.
- name: Install + configure Tempo
  hosts: tempo
  become: true
  gather_facts: true
  roles:
    - tempo

- name: Install + configure otel-collector
  hosts: otel_collectors
  become: true
  gather_facts: true
  roles:
    - otel_collector

View file

@ -0,0 +1,54 @@
# `otel_collector` role — OpenTelemetry collector in front of Tempo
Installs `opentelemetry-collector-contrib` (pinned via `otel_collector_version`) as a systemd service, renders a config that receives OTLP/gRPC from `veza-backend-api`, applies a tail-based sampler, and ships traces to Tempo.
## Why a collector instead of API → Tempo direct
- **Sampling decisions are server-side.** The API can't know if a trace had errors at the moment it ships its first span; the collector buffers a trace for 5s, then keeps it (errors + > 500ms) or drops it.
- **Retry buffering.** If Tempo is down for 30s, the collector retries; the API doesn't have to.
- **Cardinality fences.** The transform processor can drop high-cardinality attributes before they reach Tempo if a future regression sneaks one in.
## Pipeline
```
veza-backend-api ──OTLP/gRPC:4317──▶ otel-collector ──OTLP/gRPC:4319──▶ Tempo
└─── self-metrics → Prometheus :8888
```
Processors in order:
1. `memory_limiter` (256 MiB cap)
2. `resourcedetection` (host.name, host.id from /etc/machine-id)
3. `tail_sampling` (errors + slow always; rest at `otel_collector_tail_sample_ok_pct`%)
4. `batch` (1s flush, 8192 spans)
## Defaults
| variable | default | meaning |
| --------------------------------------- | -------------------- | ------------------------------------------------- |
| `otel_collector_version` | `0.116.1` | release tag from `opentelemetry-collector-releases` |
| `otel_collector_grpc_port` | `4317` | OTLP/gRPC listener |
| `otel_collector_http_port` | `4318` | OTLP/HTTP listener (kept open for browser SDKs) |
| `otel_collector_tempo_endpoint` | `tempo.lxd:4319` | Tempo OTLP gRPC |
| `otel_collector_tail_sample_ok_pct` | `10` | % of healthy traces kept |
| `otel_collector_memory_limit_mib` | `256` | hard cap |
## Operations
```bash
# Status:
sudo systemctl status otel-collector
sudo journalctl -u otel-collector -f
# Health:
curl -fsS http://otel-collector.lxd:13133
# Self-metrics (collector throughput):
curl -fsS http://otel-collector.lxd:8888/metrics | grep otelcol_
```
## What this role does NOT cover
- **mTLS between API/collector/Tempo.** `tls.insecure: true` everywhere — the security boundary is the Incus bridge for v1.0. W4 swaps in cert-manager-issued certs.
- **Multi-region collector mesh.** Single-host deploy. v1.1+ adds a second collector behind HAProxy.

View file

@ -0,0 +1,28 @@
# otel_collector defaults — pin opentelemetry-collector-contrib to a
# known-good release. The "contrib" distribution is required because
# the `tail_sampling` processor used by templates/otel-collector.yaml.j2
# ships only in contrib, not in the core distribution. Override
# `otel_collector_version` per-env if you want a different release.
---
otel_collector_version: "0.116.1"
otel_collector_arch: amd64

# Where the collector listens for spans from veza-backend-api. The
# backend default is localhost:4317 — flip both if you split hosts.
otel_collector_grpc_port: 4317
otel_collector_http_port: 4318

# Tempo upstream. The Tempo container (roles/tempo) listens on its own
# OTLP gRPC port (default 4319 — distinct so the collector and Tempo
# don't fight over 4317 when colocated on the same host).
otel_collector_tempo_endpoint: "tempo.lxd:4319"

# Tail-sampling knobs. The collector keeps 100% of error traces and
# `_ok_pct`% of healthy ones.
# NOTE(review): only `otel_collector_tail_sample_ok_pct` is consumed by
# the config template today — error traces are kept unconditionally by
# a status_code policy, so `_error_pct` is informational until the
# template is parameterized. Confirm before relying on overriding it.
otel_collector_tail_sample_error_pct: 100
otel_collector_tail_sample_ok_pct: 10

# Resource limits — the collector is co-located with the API on the
# Incus host, so we cap it to 256 MiB heap to avoid memory pressure.
otel_collector_memory_limit_mib: 256
otel_collector_memory_spike_limit_mib: 64

View file

@ -0,0 +1,6 @@
---
# Notified by the install/config/service tasks; flushed at end of play.
# daemon_reload picks up unit-file changes rendered in the same run.
- name: Restart otel-collector
  ansible.builtin.systemd:
    name: otel-collector
    state: restarted
    daemon_reload: true

View file

@ -0,0 +1,96 @@
# otel_collector role — installs opentelemetry-collector-contrib as a
# tarball under /opt, drops the systemd unit, renders the config, and
# starts it. Idempotent. Designed to run in an Incus container so the
# collector can be restarted independently of the API process.
---
- name: Ensure /opt/otelcol-contrib exists
  ansible.builtin.file:
    path: /opt/otelcol-contrib
    state: directory
    owner: root
    group: root
    mode: "0755"
  tags: [otel_collector, install]

# stat on the versioned binary is the idempotency anchor: when it
# exists, the download + extract tasks below are skipped entirely.
- name: Check installed otelcol version
  ansible.builtin.stat:
    path: "/opt/otelcol-contrib/otelcol-contrib-{{ otel_collector_version }}"
  register: otelcol_installed
  tags: [otel_collector, install]

# NOTE(review): no `checksum:` on the download — a tampered GitHub
# release would go undetected; consider pinning a sha256 alongside the
# version. The tarball is also left in /tmp after install (harmless,
# but worth a cleanup task).
- name: Download opentelemetry-collector-contrib tarball
  ansible.builtin.get_url:
    url: "https://github.com/open-telemetry/opentelemetry-collector-releases/releases/download/v{{ otel_collector_version }}/otelcol-contrib_{{ otel_collector_version }}_linux_{{ otel_collector_arch }}.tar.gz"
    dest: "/tmp/otelcol-contrib-{{ otel_collector_version }}.tar.gz"
    mode: "0644"
  when: not otelcol_installed.stat.exists
  tags: [otel_collector, install]

# GNU tar --transform renames the single `otelcol-contrib` member to a
# versioned filename on extraction, so multiple versions can coexist
# under /opt for instant rollback.
- name: Extract collector binary into versioned slot
  ansible.builtin.unarchive:
    src: "/tmp/otelcol-contrib-{{ otel_collector_version }}.tar.gz"
    dest: /opt/otelcol-contrib
    remote_src: true
    creates: "/opt/otelcol-contrib/otelcol-contrib-{{ otel_collector_version }}"
    extra_opts:
      - "--transform=s|^otelcol-contrib$|otelcol-contrib-{{ otel_collector_version }}|"
  when: not otelcol_installed.stat.exists
  tags: [otel_collector, install]

# /usr/local/bin/otelcol-contrib symlink → versioned binary. Lets us
# bump the version by changing only `otel_collector_version` and
# re-running the role; systemd unit doesn't change.
- name: Symlink /usr/local/bin/otelcol-contrib → versioned binary
  ansible.builtin.file:
    src: "/opt/otelcol-contrib/otelcol-contrib-{{ otel_collector_version }}"
    dest: /usr/local/bin/otelcol-contrib
    state: link
    force: true
  notify: Restart otel-collector
  tags: [otel_collector, install]

- name: Create otel-collector system user
  ansible.builtin.user:
    name: otelcol
    system: true
    home: /var/lib/otel-collector
    shell: /usr/sbin/nologin
    create_home: true
  tags: [otel_collector, install]

- name: Ensure /etc/otel-collector exists
  ansible.builtin.file:
    path: /etc/otel-collector
    state: directory
    owner: root
    group: otelcol
    mode: "0750"
  tags: [otel_collector, config]

- name: Render collector config
  ansible.builtin.template:
    src: otel-collector.yaml.j2
    dest: /etc/otel-collector/otel-collector.yaml
    owner: root
    group: otelcol
    mode: "0640"
  notify: Restart otel-collector
  tags: [otel_collector, config]

- name: Render systemd unit
  ansible.builtin.template:
    src: otel-collector.service.j2
    dest: /etc/systemd/system/otel-collector.service
    owner: root
    group: root
    mode: "0644"
  notify: Restart otel-collector
  tags: [otel_collector, service]

- name: Enable + start otel-collector
  ansible.builtin.systemd:
    name: otel-collector
    state: started
    enabled: true
    daemon_reload: true
  tags: [otel_collector, service]

View file

@ -0,0 +1,27 @@
# Managed by Ansible — do not edit by hand.
[Unit]
Description=OpenTelemetry Collector (contrib)
Documentation=https://opentelemetry.io/docs/collector/
After=network-online.target
Wants=network-online.target

[Service]
Type=simple
User=otelcol
Group=otelcol
ExecStart=/usr/local/bin/otelcol-contrib --config=/etc/otel-collector/otel-collector.yaml
Restart=on-failure
RestartSec=5s
# Raised from the 1024 default — the collector holds one fd per client
# gRPC stream plus its own telemetry listeners.
LimitNOFILE=65535
# Hardening — same baseline as the other Ansible-managed daemons.
# ProtectSystem=strict makes the whole fs read-only except the paths
# whitelisted via ReadWritePaths.
NoNewPrivileges=true
ProtectSystem=strict
ProtectHome=true
ReadWritePaths=/var/lib/otel-collector
PrivateTmp=true
ProtectKernelTunables=true
ProtectKernelModules=true
ProtectControlGroups=true

[Install]
WantedBy=multi-user.target

View file

@ -0,0 +1,97 @@
# Managed by Ansible — do not edit by hand.
#
# opentelemetry-collector-contrib config.
# Pipeline (actual execution order — see service.pipelines below):
#   OTLP receiver → memory_limiter → resourcedetection → tail_sampling
#   → batch → OTLP exporter to Tempo (gRPC).
#
# Tail sampling keeps every error *trace* and {{ otel_collector_tail_sample_ok_pct }}%
# of healthy traces — decisions are per-trace, not per-span — which is
# what we want in prod (Tempo storage isn't free). Override percentages
# in inventory group_vars.

receivers:
  otlp:
    protocols:
      grpc:
        endpoint: "0.0.0.0:{{ otel_collector_grpc_port }}"
      http:
        endpoint: "0.0.0.0:{{ otel_collector_http_port }}"

processors:
  # memory_limiter goes FIRST so it can reject spans before the rest of
  # the pipeline allocates anything. Without it the collector OOMs
  # silently when the API bursts.
  memory_limiter:
    check_interval: 1s
    limit_mib: {{ otel_collector_memory_limit_mib }}
    spike_limit_mib: {{ otel_collector_memory_spike_limit_mib }}

  # batch — flushes every 1s OR when 8192 spans queue up. The exporter
  # likes batches, but we don't want a single span to wait > 1s on a
  # quiet system. Runs LAST, after the sampling decision.
  batch:
    timeout: 1s
    send_batch_size: 8192
    send_batch_max_size: 16384

  # tail_sampling — see policies below. Decision wait is 5s: spans are
  # buffered for 5s after their trace's first span lands, then a
  # decision (keep / drop) is taken. Policies are OR-ed: a trace is
  # kept if ANY policy keeps it.
  tail_sampling:
    decision_wait: 5s
    num_traces: 50000
    expected_new_traces_per_sec: 100
    policies:
      # ALWAYS keep error traces — they're how we debug prod.
      - name: keep-errors
        type: status_code
        status_code:
          status_codes: [ERROR]
      # ALWAYS keep slow traces (> 500ms). Catches latency spikes even
      # when the request "succeeded".
      - name: keep-slow
        type: latency
        latency:
          threshold_ms: 500
      # Sample remaining healthy traces at the env percentage.
      - name: sample-rest
        type: probabilistic
        probabilistic:
          sampling_percentage: {{ otel_collector_tail_sample_ok_pct }}

  # resourcedetection — best-effort attribute enrichment so spans carry
  # host.name and host.id even if the SDK forgot to add them.
  # override: false means SDK-supplied values win.
  resourcedetection:
    detectors: [system, env]
    timeout: 2s
    override: false

exporters:
  otlp/tempo:
    endpoint: "{{ otel_collector_tempo_endpoint }}"
    tls:
      insecure: true # mTLS is W4 territory; the Incus bridge is the security boundary

  # debug exporter — useful in lab; in prod set verbosity: basic so it
  # stays quiet. Uncomment in the service.pipelines block to enable.
  debug:
    verbosity: basic

extensions:
  health_check:
    endpoint: "0.0.0.0:13133"
  pprof:
    endpoint: "127.0.0.1:1777"

service:
  extensions: [health_check, pprof]
  pipelines:
    traces:
      receivers: [otlp]
      processors: [memory_limiter, resourcedetection, tail_sampling, batch]
      exporters: [otlp/tempo]
  telemetry:
    logs:
      level: info
    metrics:
      level: basic
      # NOTE(review): `telemetry.metrics.address` is deprecated upstream
      # in favour of metric readers; it still works on v0.116.x —
      # revisit on the next collector bump.
      address: "0.0.0.0:8888" # collector self-metrics scraped by Prometheus

View file

@ -0,0 +1,50 @@
# `tempo` role — Grafana Tempo trace backend
Single-binary Tempo (monolithic mode), local-disk storage, ~14 day retention. Receives OTLP/gRPC from `roles/otel_collector`, exposes the query API on `:3200` for Grafana.
## Topology
```
otel-collector ──OTLP/gRPC:4319──▶ tempo ──HTTP:3200──▶ Grafana data source
└─── /var/lib/tempo (blocks + WAL)
```
## Defaults
| variable | default | meaning |
| --------------------------- | -------------------- | ---------------------------- |
| `tempo_version` | `2.7.1` | release tag |
| `tempo_otlp_grpc_port` | `4319` | OTLP/gRPC listener |
| `tempo_http_port` | `3200` | query API |
| `tempo_storage_backend` | `local` | `local` (v1.0) or `s3` (v1.1+) |
| `tempo_storage_local_path` | `/var/lib/tempo` | block + WAL root |
| `tempo_retention_h` | `336` (14d) | block retention |
## Operations
```bash
# Status:
sudo systemctl status tempo
sudo journalctl -u tempo -f
# Health:
curl -fsS http://tempo.lxd:3200/ready
curl -fsS http://tempo.lxd:3200/metrics | grep tempo_
# Query a trace by ID:
curl -fsS "http://tempo.lxd:3200/api/traces/<trace_id>"
# Search recent traces by service:
curl -fsS "http://tempo.lxd:3200/api/search?tags=service.name=veza-backend-api"
```
## Grafana data source
In Grafana, add a Tempo data source pointing at `http://tempo.lxd:3200`. The service map in `config/grafana/dashboards/service-map.json` (W2 Day 9) is wired to this data source by name `tempo`.
## What this role does NOT cover
- **S3-backed storage.** v1.0 = local disk, single-host. v1.1 swaps `storage.trace.backend: s3` to ship blocks to MinIO so Tempo can run multi-replica.
- **Multi-tenancy.** Single tenant (`single-tenant`) until v1.2 brings hosted multi-tenancy in.
- **Metrics generator.** Disabled (`overrides.defaults.metrics_generator.processors: []`). NOTE: the otel-collector pipeline does not currently include a servicegraph/spanmetrics connector either, so the Grafana service-map panel has no data source until one of the two is enabled.

View file

@ -0,0 +1,25 @@
# Tempo defaults — single-binary mode (monolithic), local backend on
# the container's filesystem. Plenty for v1.0; a later milestone moves
# to S3 (the same MinIO bucket the rest of the stack uses).
# NOTE(review): this header says "W3+" elsewhere the commit says "v1.1"
# for the S3 move — pick one label and keep them in sync.
---
tempo_version: "2.7.1"
tempo_arch: amd64

# Where Tempo listens for spans from the otel-collector. The collector
# default in roles/otel_collector points at tempo.lxd:4319, so keep
# them in sync.
tempo_otlp_grpc_port: 4319

# Tempo's own HTTP API (Grafana data source uses this).
tempo_http_port: 3200

# Storage. v1.0 = local disk. v1.1 = S3 (MinIO bucket veza-tempo).
tempo_storage_backend: local
tempo_storage_local_path: /var/lib/tempo

# Retention — Tempo doesn't compact aggressively; 14d default.
tempo_retention_h: 336 # 14 days

# Resource sizing — see https://grafana.com/docs/tempo/latest/setup/
# defaults are tuned for ~5k spans/sec which is way more than v1.0
# traffic. Override if the API gets popular.
tempo_max_block_bytes: 524288000 # 500 MiB

View file

@ -0,0 +1,6 @@
---
# Notified by the install/config/service tasks; flushed at end of play.
# daemon_reload picks up unit-file changes rendered in the same run.
- name: Restart tempo
  ansible.builtin.systemd:
    name: tempo
    state: restarted
    daemon_reload: true

View file

@ -0,0 +1,100 @@
# Tempo role — installs the single-binary distribution under /opt,
# renders monolithic config, sets up systemd. Idempotent.
---
- name: Ensure /opt/tempo exists
  ansible.builtin.file:
    path: /opt/tempo
    state: directory
    owner: root
    group: root
    mode: "0755"
  tags: [tempo, install]

# stat on the versioned binary is the idempotency anchor: when it
# exists, the download + extract tasks below are skipped entirely.
- name: Check installed Tempo version
  ansible.builtin.stat:
    path: "/opt/tempo/tempo-{{ tempo_version }}"
  register: tempo_installed
  tags: [tempo, install]

# NOTE(review): no `checksum:` on the download — consider pinning a
# sha256 alongside the version (same remark as roles/otel_collector).
- name: Download Tempo tarball
  ansible.builtin.get_url:
    url: "https://github.com/grafana/tempo/releases/download/v{{ tempo_version }}/tempo_{{ tempo_version }}_linux_{{ tempo_arch }}.tar.gz"
    dest: "/tmp/tempo-{{ tempo_version }}.tar.gz"
    mode: "0644"
  when: not tempo_installed.stat.exists
  tags: [tempo, install]

# GNU tar --transform renames the `tempo` member to a versioned
# filename on extraction, so multiple versions can coexist under /opt.
- name: Extract Tempo binary into versioned slot
  ansible.builtin.unarchive:
    src: "/tmp/tempo-{{ tempo_version }}.tar.gz"
    dest: /opt/tempo
    remote_src: true
    creates: "/opt/tempo/tempo-{{ tempo_version }}"
    extra_opts:
      - "--transform=s|^tempo$|tempo-{{ tempo_version }}|"
  when: not tempo_installed.stat.exists
  tags: [tempo, install]

# Version bumps only change the symlink target; the systemd unit keeps
# pointing at /usr/local/bin/tempo.
- name: Symlink /usr/local/bin/tempo → versioned binary
  ansible.builtin.file:
    src: "/opt/tempo/tempo-{{ tempo_version }}"
    dest: /usr/local/bin/tempo
    state: link
    force: true
  notify: Restart tempo
  tags: [tempo, install]

- name: Create tempo system user
  ansible.builtin.user:
    name: tempo
    system: true
    home: "{{ tempo_storage_local_path }}"
    shell: /usr/sbin/nologin
    create_home: true
  tags: [tempo, install]

# Blocks + WAL live here; must be writable by the tempo user (the
# systemd unit whitelists it via ReadWritePaths).
- name: Ensure storage directory ownership
  ansible.builtin.file:
    path: "{{ tempo_storage_local_path }}"
    state: directory
    owner: tempo
    group: tempo
    mode: "0755"
  tags: [tempo, install]

- name: Ensure /etc/tempo exists
  ansible.builtin.file:
    path: /etc/tempo
    state: directory
    owner: root
    group: tempo
    mode: "0750"
  tags: [tempo, config]

- name: Render tempo.yaml
  ansible.builtin.template:
    src: tempo.yaml.j2
    dest: /etc/tempo/tempo.yaml
    owner: root
    group: tempo
    mode: "0640"
  notify: Restart tempo
  tags: [tempo, config]

- name: Render systemd unit
  ansible.builtin.template:
    src: tempo.service.j2
    dest: /etc/systemd/system/tempo.service
    owner: root
    group: root
    mode: "0644"
  notify: Restart tempo
  tags: [tempo, service]

- name: Enable + start tempo
  ansible.builtin.systemd:
    name: tempo
    state: started
    enabled: true
    daemon_reload: true
  tags: [tempo, service]

View file

@ -0,0 +1,26 @@
# Managed by Ansible — do not edit by hand.
[Unit]
Description=Grafana Tempo
Documentation=https://grafana.com/docs/tempo/
After=network-online.target
Wants=network-online.target

[Service]
Type=simple
User=tempo
Group=tempo
ExecStart=/usr/local/bin/tempo -config.file=/etc/tempo/tempo.yaml
Restart=on-failure
RestartSec=5s
LimitNOFILE=65535
# Hardening — same baseline as the other Ansible-managed daemons.
# ProtectSystem=strict makes the fs read-only except the storage path
# below, which holds the WAL + blocks.
NoNewPrivileges=true
ProtectSystem=strict
ProtectHome=true
ReadWritePaths={{ tempo_storage_local_path }}
PrivateTmp=true
ProtectKernelTunables=true
ProtectKernelModules=true
ProtectControlGroups=true

[Install]
WantedBy=multi-user.target

View file

@ -0,0 +1,58 @@
# Managed by Ansible — do not edit by hand.
#
# Tempo monolithic mode. Receives OTLP from the otel-collector,
# stores in {{ tempo_storage_backend }} backend, exposes the query
# API on :{{ tempo_http_port }} for Grafana.
#
# NOTE(review): several numeric literals below use underscores
# (100_000_000). That is YAML 1.1 syntax — confirm Tempo's YAML parser
# accepts it, or spell the digits out to be safe.
server:
  http_listen_port: {{ tempo_http_port }}
  grpc_listen_port: 9095
  log_level: info

distributor:
  receivers:
    otlp:
      protocols:
        grpc:
          endpoint: "0.0.0.0:{{ tempo_otlp_grpc_port }}"

ingester:
  # Cut a block after 10s trace idle / 5m age / max_block_bytes,
  # whichever comes first.
  trace_idle_period: 10s
  max_block_bytes: {{ tempo_max_block_bytes }}
  max_block_duration: 5m
  flush_check_period: 10s

compactor:
  compaction:
    block_retention: {{ tempo_retention_h }}h
    compacted_block_retention: 1h
    compaction_window: 1h
    # NOTE(review): 100 MB compaction target is SMALLER than the
    # ingester's max_block_bytes above (500 MiB default). Compacted
    # blocks are normally allowed to be larger than ingester blocks,
    # not smaller — confirm this value is intended.
    max_block_bytes: 100_000_000
    retention_concurrency: 1

storage:
  trace:
    backend: {{ tempo_storage_backend }}
{% if tempo_storage_backend == "local" %}
    local:
      path: {{ tempo_storage_local_path }}/blocks
{% endif %}
    wal:
      path: {{ tempo_storage_local_path }}/wal
    pool:
      max_workers: 100
      queue_depth: 10000

# v1.0 single-binary mode — overrides keep the limits sane and prevent
# a misbehaving client from blowing up Tempo. We have one client today
# (veza-backend-api), one tenant.
overrides:
  defaults:
    ingestion:
      rate_limit_bytes: 15_000_000 # 15 MB/s per tenant
      burst_size_bytes: 30_000_000
    metrics_generator:
      # Empty list = metrics generator disabled.
      # NOTE(review): the collector pipeline does not currently run a
      # servicegraph connector either, so the Grafana service-map panel
      # has no data source until one of the two is enabled — confirm
      # which side is meant to produce service-graph metrics.
      processors: []

usage_report:
  reporting_enabled: false

View file

@ -28,6 +28,7 @@ import (
"veza-backend-api/internal/services" "veza-backend-api/internal/services"
"veza-backend-api/internal/services/hyperswitch" "veza-backend-api/internal/services/hyperswitch"
"veza-backend-api/internal/shutdown" "veza-backend-api/internal/shutdown"
"veza-backend-api/internal/tracing"
"veza-backend-api/internal/workers" "veza-backend-api/internal/workers"
_ "veza-backend-api/docs" // Import docs for swagger _ "veza-backend-api/docs" // Import docs for swagger
@ -57,6 +58,11 @@ import (
// @name X-API-Key // @name X-API-Key
// @description Developer API key (obtain from Developer Portal). Format: vza_xxxxx // @description Developer API key (obtain from Developer Portal). Format: vza_xxxxx
// appVersion is overridden at build time via
// `-ldflags "-X main.appVersion=vX.Y.Z"`. Used as the OTel resource
// attribute service.version + Sentry release tag.
var appVersion = "dev"
func main() { func main() {
// Charger les variables d'environnement // Charger les variables d'environnement
// NOTE: Do not write to stderr to avoid broken pipe errors with systemd journald // NOTE: Do not write to stderr to avoid broken pipe errors with systemd journald
@ -108,6 +114,29 @@ func main() {
logger.Info(" Sentry non configuré (SENTRY_DSN non défini)") logger.Info(" Sentry non configuré (SENTRY_DSN non défini)")
} }
// v1.0.9 Day 9 — OpenTelemetry tracer init. Spans flow to the
// otel-collector container (provisioned by infra/ansible/roles/
// otel_collector) which forwards them to Tempo. Disabled in
// dev / unit tests via OTEL_SDK_DISABLED=true to keep the
// process from background-dialing localhost:4317.
tracerCtx, tracerCancel := context.WithTimeout(context.Background(), 10*time.Second)
// AppVersion drawn from build-time ldflag; falls back to "dev" so
// the resource attribute is always populated. Set via:
// go build -ldflags "-X main.appVersion=v1.0.9" ./cmd/api
tracerProvider, err := tracing.InitOTLPTracer(tracerCtx, cfg.Env, appVersion, logger)
tracerCancel()
if err != nil {
// Tracing failure is operational, not fatal. The collector
// could be starting up at the same time as the backend; the
// exporter retries internally.
logger.Warn("OTel tracer init failed — continuing without spans", zap.Error(err))
}
defer func() {
	// Guard against a nil provider: the error branch above logs and
	// continues, and if InitOTLPTracer returns (nil, err) this deferred
	// Shutdown would panic at process exit. TODO confirm whether
	// InitOTLPTracer guarantees a non-nil (no-op) provider on error.
	if tracerProvider == nil {
		return
	}
	shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	_ = tracerProvider.Shutdown(shutdownCtx)
}()
// Initialisation de la base de données // Initialisation de la base de données
db := cfg.Database db := cfg.Database
if db == nil { if db == nil {

View file

@ -38,10 +38,14 @@ require (
github.com/swaggo/swag v1.16.6 github.com/swaggo/swag v1.16.6
github.com/testcontainers/testcontainers-go v0.42.0 github.com/testcontainers/testcontainers-go v0.42.0
github.com/testcontainers/testcontainers-go/modules/postgres v0.42.0 github.com/testcontainers/testcontainers-go/modules/postgres v0.42.0
go.opentelemetry.io/otel v1.43.0
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0
go.opentelemetry.io/otel/sdk v1.43.0
go.uber.org/goleak v1.3.0 go.uber.org/goleak v1.3.0
go.uber.org/zap v1.27.0 go.uber.org/zap v1.27.0
golang.org/x/crypto v0.48.0 golang.org/x/crypto v0.49.0
golang.org/x/oauth2 v0.30.0 golang.org/x/oauth2 v0.35.0
golang.org/x/time v0.12.0 golang.org/x/time v0.12.0
gopkg.in/natefinch/lumberjack.v2 v2.2.1 gopkg.in/natefinch/lumberjack.v2 v2.2.1
gorm.io/driver/postgres v1.6.0 gorm.io/driver/postgres v1.6.0
@ -50,7 +54,7 @@ require (
) )
require ( require (
cloud.google.com/go/compute/metadata v0.3.0 // indirect cloud.google.com/go/compute/metadata v0.9.0 // indirect
dario.cat/mergo v1.0.2 // indirect dario.cat/mergo v1.0.2 // indirect
github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c // indirect github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c // indirect
github.com/KyleBanks/depth v1.2.1 // indirect github.com/KyleBanks/depth v1.2.1 // indirect
@ -77,6 +81,7 @@ require (
github.com/bytedance/sonic v1.14.0 // indirect github.com/bytedance/sonic v1.14.0 // indirect
github.com/bytedance/sonic/loader v0.3.0 // indirect github.com/bytedance/sonic/loader v0.3.0 // indirect
github.com/cenkalti/backoff/v4 v4.3.0 // indirect github.com/cenkalti/backoff/v4 v4.3.0 // indirect
github.com/cenkalti/backoff/v5 v5.0.3 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/cloudwego/base64x v0.1.6 // indirect github.com/cloudwego/base64x v0.1.6 // indirect
github.com/containerd/errdefs v1.0.0 // indirect github.com/containerd/errdefs v1.0.0 // indirect
@ -105,6 +110,7 @@ require (
github.com/go-playground/universal-translator v0.18.1 // indirect github.com/go-playground/universal-translator v0.18.1 // indirect
github.com/goccy/go-json v0.10.2 // indirect github.com/goccy/go-json v0.10.2 // indirect
github.com/goccy/go-yaml v1.18.0 // indirect github.com/goccy/go-yaml v1.18.0 // indirect
github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 // indirect
github.com/jackc/pgpassfile v1.0.0 // indirect github.com/jackc/pgpassfile v1.0.0 // indirect
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect
github.com/jackc/pgx/v5 v5.6.0 // indirect github.com/jackc/pgx/v5 v5.6.0 // indirect
@ -151,18 +157,21 @@ require (
github.com/yusufpapurcu/wmi v1.2.4 // indirect github.com/yusufpapurcu/wmi v1.2.4 // indirect
go.opentelemetry.io/auto/sdk v1.2.1 // indirect go.opentelemetry.io/auto/sdk v1.2.1 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 // indirect
go.opentelemetry.io/otel v1.41.0 // indirect go.opentelemetry.io/otel/metric v1.43.0 // indirect
go.opentelemetry.io/otel/metric v1.41.0 // indirect go.opentelemetry.io/otel/trace v1.43.0 // indirect
go.opentelemetry.io/otel/trace v1.41.0 // indirect go.opentelemetry.io/proto/otlp v1.10.0 // indirect
go.uber.org/multierr v1.10.0 // indirect go.uber.org/multierr v1.10.0 // indirect
golang.org/x/arch v0.20.0 // indirect golang.org/x/arch v0.20.0 // indirect
golang.org/x/image v0.38.0 // indirect golang.org/x/image v0.38.0 // indirect
golang.org/x/mod v0.33.0 // indirect golang.org/x/mod v0.33.0 // indirect
golang.org/x/net v0.51.0 // indirect golang.org/x/net v0.52.0 // indirect
golang.org/x/sync v0.20.0 // indirect golang.org/x/sync v0.20.0 // indirect
golang.org/x/sys v0.42.0 // indirect golang.org/x/sys v0.42.0 // indirect
golang.org/x/text v0.35.0 // indirect golang.org/x/text v0.35.0 // indirect
golang.org/x/tools v0.42.0 // indirect golang.org/x/tools v0.42.0 // indirect
google.golang.org/protobuf v1.36.9 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 // indirect
google.golang.org/grpc v1.80.0 // indirect
google.golang.org/protobuf v1.36.11 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect
) )

View file

@ -1,5 +1,5 @@
cloud.google.com/go/compute/metadata v0.3.0 h1:Tz+eQXMEqDIKRsmY3cHTL6FVaynIjX2QxYC4trgAKZc= cloud.google.com/go/compute/metadata v0.9.0 h1:pDUj4QMoPejqq20dK0Pg2N4yG9zIkYGdBtwLoEkH9Zs=
cloud.google.com/go/compute/metadata v0.3.0/go.mod h1:zFmK7XCadkQkj6TtorcaGlCW1hT1fIilQDwofLpJ20k= cloud.google.com/go/compute/metadata v0.9.0/go.mod h1:E0bWwX5wTnLPedCKqk3pJmVgCBSM6qQI1yTBdEb3C10=
dario.cat/mergo v1.0.2 h1:85+piFYR1tMbRrLcDwR18y4UKJ3aH1Tbzi24VRW1TK8= dario.cat/mergo v1.0.2 h1:85+piFYR1tMbRrLcDwR18y4UKJ3aH1Tbzi24VRW1TK8=
dario.cat/mergo v1.0.2/go.mod h1:E/hbnu0NxMFBjpMIE34DRGLWqDy0g5FuKDhCb31ngxA= dario.cat/mergo v1.0.2/go.mod h1:E/hbnu0NxMFBjpMIE34DRGLWqDy0g5FuKDhCb31ngxA=
github.com/AdaLogics/go-fuzz-headers v0.0.0-20240806141605-e8a1dd7889d6 h1:He8afgbRMd7mFxO99hRNu+6tazq8nFF9lIwo9JFroBk= github.com/AdaLogics/go-fuzz-headers v0.0.0-20240806141605-e8a1dd7889d6 h1:He8afgbRMd7mFxO99hRNu+6tazq8nFF9lIwo9JFroBk=
@ -75,6 +75,8 @@ github.com/bytedance/sonic/loader v0.3.0 h1:dskwH8edlzNMctoruo8FPTJDF3vLtDT0sXZw
github.com/bytedance/sonic/loader v0.3.0/go.mod h1:N8A3vUdtUebEY2/VQC0MyhYeKUFosQU6FxH2JmUe6VI= github.com/bytedance/sonic/loader v0.3.0/go.mod h1:N8A3vUdtUebEY2/VQC0MyhYeKUFosQU6FxH2JmUe6VI=
github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8=
github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE=
github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM=
github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw=
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/cloudwego/base64x v0.1.6 h1:t11wG9AECkCDk5fMSoxmufanudBtJ+/HemLstXDLI2M= github.com/cloudwego/base64x v0.1.6 h1:t11wG9AECkCDk5fMSoxmufanudBtJ+/HemLstXDLI2M=
@ -167,6 +169,8 @@ github.com/goccy/go-yaml v1.18.0/go.mod h1:XBurs7gK8ATbW4ZPGKgcbrY1Br56PdM69F7Lk
github.com/golang-jwt/jwt/v5 v5.2.1/go.mod h1:pqrtFR0X4osieyHYxtmOUWsAWrfe1Q5UVIyoH402zdk= github.com/golang-jwt/jwt/v5 v5.2.1/go.mod h1:pqrtFR0X4osieyHYxtmOUWsAWrfe1Q5UVIyoH402zdk=
github.com/golang-jwt/jwt/v5 v5.3.0 h1:pv4AsKCKKZuqlgs5sUmn4x8UlGa0kEVt/puTpKx9vvo= github.com/golang-jwt/jwt/v5 v5.3.0 h1:pv4AsKCKKZuqlgs5sUmn4x8UlGa0kEVt/puTpKx9vvo=
github.com/golang-jwt/jwt/v5 v5.3.0/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE= github.com/golang-jwt/jwt/v5 v5.3.0/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE=
github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps=
github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
@ -174,6 +178,8 @@ github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 h1:HWRh5R2+9EifMyIHV7ZV+MIZqgz+PMpZ14Jynv3O2Zs=
github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0/go.mod h1:JfhWUomR1baixubs02l85lZYYOm7LV6om4ceouMv45c=
github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM= github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM=
github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg= github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg=
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 h1:iCEnooe7UlwOQYpKFhBabPMi4aNAfoODPEFNiAnClxo= github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 h1:iCEnooe7UlwOQYpKFhBabPMi4aNAfoODPEFNiAnClxo=
@ -335,16 +341,22 @@ go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ
go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 h1:sbiXRNDSWJOTobXh5HyQKjq6wUC5tNybqjIqDpAY4CU= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 h1:sbiXRNDSWJOTobXh5HyQKjq6wUC5tNybqjIqDpAY4CU=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0/go.mod h1:69uWxva0WgAA/4bu2Yy70SLDBwZXuQ6PbBpbsa5iZrQ= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0/go.mod h1:69uWxva0WgAA/4bu2Yy70SLDBwZXuQ6PbBpbsa5iZrQ=
go.opentelemetry.io/otel v1.41.0 h1:YlEwVsGAlCvczDILpUXpIpPSL/VPugt7zHThEMLce1c= go.opentelemetry.io/otel v1.43.0 h1:mYIM03dnh5zfN7HautFE4ieIig9amkNANT+xcVxAj9I=
go.opentelemetry.io/otel v1.41.0/go.mod h1:Yt4UwgEKeT05QbLwbyHXEwhnjxNO6D8L5PQP51/46dE= go.opentelemetry.io/otel v1.43.0/go.mod h1:JuG+u74mvjvcm8vj8pI5XiHy1zDeoCS2LB1spIq7Ay0=
go.opentelemetry.io/otel/metric v1.41.0 h1:rFnDcs4gRzBcsO9tS8LCpgR0dxg4aaxWlJxCno7JlTQ= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 h1:88Y4s2C8oTui1LGM6bTWkw0ICGcOLCAI5l6zsD1j20k=
go.opentelemetry.io/otel/metric v1.41.0/go.mod h1:xPvCwd9pU0VN8tPZYzDZV/BMj9CM9vs00GuBjeKhJps= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0/go.mod h1:Vl1/iaggsuRlrHf/hfPJPvVag77kKyvrLeD10kpMl+A=
go.opentelemetry.io/otel/sdk v1.35.0 h1:iPctf8iprVySXSKJffSS79eOjl9pvxV9ZqOWT0QejKY= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0 h1:RAE+JPfvEmvy+0LzyUA25/SGawPwIUbZ6u0Wug54sLc=
go.opentelemetry.io/otel/sdk v1.35.0/go.mod h1:+ga1bZliga3DxJ3CQGg3updiaAJoNECOgJREo9KHGQg= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0/go.mod h1:AGmbycVGEsRx9mXMZ75CsOyhSP6MFIcj/6dnG+vhVjk=
go.opentelemetry.io/otel/sdk/metric v1.35.0 h1:1RriWBmCKgkeHEhM7a2uMjMUfP7MsOF5JpUCaEqEI9o= go.opentelemetry.io/otel/metric v1.43.0 h1:d7638QeInOnuwOONPp4JAOGfbCEpYb+K6DVWvdxGzgM=
go.opentelemetry.io/otel/sdk/metric v1.35.0/go.mod h1:is6XYCUMpcKi+ZsOvfluY5YstFnhW0BidkR+gL+qN+w= go.opentelemetry.io/otel/metric v1.43.0/go.mod h1:RDnPtIxvqlgO8GRW18W6Z/4P462ldprJtfxHxyKd2PY=
go.opentelemetry.io/otel/trace v1.41.0 h1:Vbk2co6bhj8L59ZJ6/xFTskY+tGAbOnCtQGVVa9TIN0= go.opentelemetry.io/otel/sdk v1.43.0 h1:pi5mE86i5rTeLXqoF/hhiBtUNcrAGHLKQdhg4h4V9Dg=
go.opentelemetry.io/otel/trace v1.41.0/go.mod h1:U1NU4ULCoxeDKc09yCWdWe+3QoyweJcISEVa1RBzOis= go.opentelemetry.io/otel/sdk v1.43.0/go.mod h1:P+IkVU3iWukmiit/Yf9AWvpyRDlUeBaRg6Y+C58QHzg=
go.opentelemetry.io/otel/sdk/metric v1.43.0 h1:S88dyqXjJkuBNLeMcVPRFXpRw2fuwdvfCGLEo89fDkw=
go.opentelemetry.io/otel/sdk/metric v1.43.0/go.mod h1:C/RJtwSEJ5hzTiUz5pXF1kILHStzb9zFlIEe85bhj6A=
go.opentelemetry.io/otel/trace v1.43.0 h1:BkNrHpup+4k4w+ZZ86CZoHHEkohws8AY+WTX09nk+3A=
go.opentelemetry.io/otel/trace v1.43.0/go.mod h1:/QJhyVBUUswCphDVxq+8mld+AvhXZLhe+8WVFxiFff0=
go.opentelemetry.io/proto/otlp v1.10.0 h1:IQRWgT5srOCYfiWnpqUYz9CVmbO8bFmKcwYxpuCSL2g=
go.opentelemetry.io/proto/otlp v1.10.0/go.mod h1:/CV4QoCR/S9yaPj8utp3lvQPoqMtxXdzn7ozvvozVqk=
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
go.uber.org/mock v0.5.2 h1:LbtPTcP8A5k9WPXj54PPPbjcI4Y6lhyOZXn+VS7wNko= go.uber.org/mock v0.5.2 h1:LbtPTcP8A5k9WPXj54PPPbjcI4Y6lhyOZXn+VS7wNko=
@ -361,8 +373,8 @@ golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliY
golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU=
golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8=
golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk=
golang.org/x/crypto v0.48.0 h1:/VRzVqiRSggnhY7gNRxPauEQ5Drw9haKdM0jqfcCFts= golang.org/x/crypto v0.49.0 h1:+Ng2ULVvLHnJ/ZFEq4KdcDd/cfjrrjjNSXNzxg0Y4U4=
golang.org/x/crypto v0.48.0/go.mod h1:r0kV5h3qnFPlQnBSrULhlsRfryS2pmewsg+XfMgkVos= golang.org/x/crypto v0.49.0/go.mod h1:ErX4dUh2UM+CFYiXZRTcMpEcN8b/1gxEuv3nODoYtCA=
golang.org/x/image v0.0.0-20191009234506-e7c1f5e7dbb8/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= golang.org/x/image v0.0.0-20191009234506-e7c1f5e7dbb8/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
golang.org/x/image v0.38.0 h1:5l+q+Y9JDC7mBOMjo4/aPhMDcxEptsX+Tt3GgRQRPuE= golang.org/x/image v0.38.0 h1:5l+q+Y9JDC7mBOMjo4/aPhMDcxEptsX+Tt3GgRQRPuE=
golang.org/x/image v0.38.0/go.mod h1:/3f6vaXC+6CEanU4KJxbcUZyEePbyKbaLoDOe4ehFYY= golang.org/x/image v0.38.0/go.mod h1:/3f6vaXC+6CEanU4KJxbcUZyEePbyKbaLoDOe4ehFYY=
@ -383,10 +395,10 @@ golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk=
golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
golang.org/x/net v0.51.0 h1:94R/GTO7mt3/4wIKpcR5gkGmRLOuE/2hNGeWq/GBIFo= golang.org/x/net v0.52.0 h1:He/TN1l0e4mmR3QqHMT2Xab3Aj3L9qjbhRm78/6jrW0=
golang.org/x/net v0.51.0/go.mod h1:aamm+2QF5ogm02fjy5Bb7CQ0WMt1/WVM7FtyaTLlA9Y= golang.org/x/net v0.52.0/go.mod h1:R1MAz7uMZxVMualyPXb+VaqGSa3LIaUqk0eEt3w36Sw=
golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI= golang.org/x/oauth2 v0.35.0 h1:Mv2mzuHuZuY2+bkyWXIHMfhNdJAdwW3FuWeCPYN5GVQ=
golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU= golang.org/x/oauth2 v0.35.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
@ -423,8 +435,8 @@ golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU=
golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk=
golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY=
golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM= golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM=
golang.org/x/term v0.40.0 h1:36e4zGLqU4yhjlmxEaagx2KuYbJq3EwY8K943ZsHcvg= golang.org/x/term v0.41.0 h1:QCgPso/Q3RTJx2Th4bDLqML4W6iJiaXFq2/ftQF13YU=
golang.org/x/term v0.40.0/go.mod h1:w2P8uVp06p2iyKKuvXIm7N/y0UCRt3UfJTfZ7oOpglM= golang.org/x/term v0.41.0/go.mod h1:3pfBgksrReYfZ5lvYM0kSO0LIkAl4Yl2bXOkKP7Ec2A=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
@ -449,8 +461,16 @@ golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k=
golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0= golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/protobuf v1.36.9 h1:w2gp2mA27hUeUzj9Ex9FBjsBm40zfaDtEWow293U7Iw= gonum.org/v1/gonum v0.17.0 h1:VbpOemQlsSMrYmn7T2OUvQ4dqxQXU+ouZFQsZOx50z4=
google.golang.org/protobuf v1.36.9/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= gonum.org/v1/gonum v0.17.0/go.mod h1:El3tOrEuMpv2UdMrbNlKEh9vd86bmQ6vqIcDwxEOc1E=
google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 h1:VPWxll4HlMw1Vs/qXtN7BvhZqsS9cdAittCNvVENElA=
google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:7QBABkRtR8z+TEnmXTqIqwJLlzrZKVfAUm7tY3yGv0M=
google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 h1:m8qni9SQFH0tJc1X0vmnpw/0t+AImlSvp30sEupozUg=
google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:4Hqkh8ycfw05ld/3BWL7rJOSfebL2Q+DVDeRgYgxUU8=
google.golang.org/grpc v1.80.0 h1:Xr6m2WmWZLETvUNvIUmeD5OAagMw3FiKmMlTdViWsHM=
google.golang.org/grpc v1.80.0/go.mod h1:ho/dLnxwi3EDJA4Zghp7k2Ec1+c2jqup0bFkw07bwF4=
google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE=
google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=

View file

@ -9,12 +9,16 @@ import (
"time" "time"
"github.com/google/uuid" "github.com/google/uuid"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/codes"
"go.uber.org/zap" "go.uber.org/zap"
"gorm.io/gorm" "gorm.io/gorm"
"veza-backend-api/internal/core/connecterrors" "veza-backend-api/internal/core/connecterrors"
"veza-backend-api/internal/models" "veza-backend-api/internal/models"
"veza-backend-api/internal/monitoring" "veza-backend-api/internal/monitoring"
"veza-backend-api/internal/tracing"
) )
var ( var (
@ -748,16 +752,30 @@ func (wp *HyperswitchWebhookPayload) IsRefundEvent() bool {
// ProcessPaymentWebhook handles Hyperswitch payment webhook. // ProcessPaymentWebhook handles Hyperswitch payment webhook.
// Updates order status and creates licenses when status is "succeeded". // Updates order status and creates licenses when status is "succeeded".
func (s *Service) ProcessPaymentWebhook(ctx context.Context, payload []byte) error { func (s *Service) ProcessPaymentWebhook(ctx context.Context, payload []byte) error {
// v1.0.9 Day 9 — payment.webhook span. Hot path on every Hyperswitch
// callback. Records payment_id (the carrier id Hyperswitch uses, not a
// secret) + status so trace search can pivot on a single payment quickly.
ctx, span := otel.Tracer(tracing.TracerName).Start(ctx, "payment.webhook")
defer span.End()
var wp HyperswitchWebhookPayload var wp HyperswitchWebhookPayload
if err := json.Unmarshal(payload, &wp); err != nil { if err := json.Unmarshal(payload, &wp); err != nil {
span.RecordError(err)
span.SetStatus(codes.Error, "invalid webhook payload")
s.logger.Error("Invalid Hyperswitch webhook payload", zap.Error(err), zap.ByteString("payload", payload)) s.logger.Error("Invalid Hyperswitch webhook payload", zap.Error(err), zap.ByteString("payload", payload))
return fmt.Errorf("invalid webhook payload: %w", err) return fmt.Errorf("invalid webhook payload: %w", err)
} }
paymentID := wp.getPaymentID() paymentID := wp.getPaymentID()
if paymentID == "" { if paymentID == "" {
span.SetStatus(codes.Error, "missing payment_id")
return fmt.Errorf("webhook payload missing payment_id") return fmt.Errorf("webhook payload missing payment_id")
} }
status := wp.getStatus() status := wp.getStatus()
span.SetAttributes(
attribute.String("payment.id", paymentID),
attribute.String("payment.status", status),
attribute.String("payment.event_type", wp.EventType),
)
// v1.0.9 item G Phase 2: subscription dispatcher. Try the subscription // v1.0.9 item G Phase 2: subscription dispatcher. Try the subscription
// flow first; if the payment_id maps to a subscription invoice, the // flow first; if the payment_id maps to a subscription invoice, the

View file

@ -17,8 +17,13 @@ import (
"veza-backend-api/internal/handlers" "veza-backend-api/internal/handlers"
"veza-backend-api/internal/models" "veza-backend-api/internal/models"
"veza-backend-api/internal/response" "veza-backend-api/internal/response"
"veza-backend-api/internal/tracing"
"github.com/gin-gonic/gin" "github.com/gin-gonic/gin"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/codes"
"go.opentelemetry.io/otel/trace"
"go.uber.org/zap" "go.uber.org/zap"
) )
@ -247,14 +252,30 @@ func (h *TrackHandler) InitiateChunkedUpload(c *gin.Context) {
return // Erreur déjà envoyée au client return // Erreur déjà envoyée au client
} }
// v1.0.9 Day 9 — track.upload.initiate span. Hot path on upload kickoff.
// chunkService doesn't accept ctx today; the span only wraps the handler
// invocation, not the S3 multipart create itself. Migrating chunkService
// to take ctx is tracked separately.
_, span := otel.Tracer(tracing.TracerName).Start(c.Request.Context(), "track.upload.initiate",
trace.WithAttributes(
attribute.String("track.upload.user_id", userID.String()),
attribute.Int("track.upload.total_chunks", req.TotalChunks),
attribute.Int64("track.upload.total_size", req.TotalSize),
),
)
defer span.End()
// Initialiser l'upload // Initialiser l'upload
// InitiateChunkedUpload retourne un string (uploadID) donc pas de souci d'int64 // InitiateChunkedUpload retourne un string (uploadID) donc pas de souci d'int64
// Note: InitiateChunkedUpload n'accepte pas de context (à migrer si nécessaire) // Note: InitiateChunkedUpload n'accepte pas de context (à migrer si nécessaire)
uploadID, err := h.chunkService.InitiateChunkedUpload(userID, req.TotalChunks, req.TotalSize, req.Filename) uploadID, err := h.chunkService.InitiateChunkedUpload(userID, req.TotalChunks, req.TotalSize, req.Filename)
if err != nil { if err != nil {
span.RecordError(err)
span.SetStatus(codes.Error, "chunk upload init failed")
response.InternalServerError(c, err.Error()) response.InternalServerError(c, err.Error())
return return
} }
span.SetAttributes(attribute.String("track.upload.id", uploadID))
response.Success(c, gin.H{ response.Success(c, gin.H{
"upload_id": uploadID, "upload_id": uploadID,

View file

@ -13,9 +13,14 @@ import (
// "veza-backend-api/internal/response" // Removed this import // "veza-backend-api/internal/response" // Removed this import
"veza-backend-api/internal/services" "veza-backend-api/internal/services"
"veza-backend-api/internal/tracing"
"github.com/gin-gonic/gin" "github.com/gin-gonic/gin"
"github.com/google/uuid" "github.com/google/uuid"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/codes"
"go.opentelemetry.io/otel/trace"
"go.uber.org/zap" "go.uber.org/zap"
) )
@ -64,10 +69,10 @@ func Login(authService *auth.AuthService, sessionService *services.SessionServic
// req.RememberMe is a bool, not *bool, so no need to check for nil or indirect // req.RememberMe is a bool, not *bool, so no need to check for nil or indirect
rememberMe := req.RememberMe rememberMe := req.RememberMe
// SECURITY(MEDIUM-011): Mask email in logs/spans to prevent PII leakage.
maskedEmail := maskEmail(req.Email)
if logger != nil { if logger != nil {
// SECURITY(MEDIUM-011): Mask email in logs to prevent PII leakage.
maskedEmail := maskEmail(req.Email)
logger.Info("Login handler processing request", logger.Info("Login handler processing request",
zap.String("email", maskedEmail), zap.String("email", maskedEmail),
zap.Bool("remember_me", rememberMe), zap.Bool("remember_me", rememberMe),
@ -77,8 +82,22 @@ func Login(authService *auth.AuthService, sessionService *services.SessionServic
// MOD-P1-004: Ajouter timeout context pour opération DB critique (login) // MOD-P1-004: Ajouter timeout context pour opération DB critique (login)
ctx, cancel := WithTimeout(c.Request.Context(), 5*time.Second) ctx, cancel := WithTimeout(c.Request.Context(), 5*time.Second)
defer cancel() defer cancel()
// v1.0.9 Day 9 — auth.login span. Hot path: every login request goes
// through here. Email is masked, no password attribute. Failure paths
// below set the span status to error.
ctx, span := otel.Tracer(tracing.TracerName).Start(ctx, "auth.login",
trace.WithAttributes(
attribute.String("auth.email_masked", maskedEmail),
attribute.Bool("auth.remember_me", rememberMe),
),
)
defer span.End()
user, tokens, err := authService.Login(ctx, req.Email, req.Password, rememberMe) user, tokens, err := authService.Login(ctx, req.Email, req.Password, rememberMe)
if err != nil { if err != nil {
span.RecordError(err)
span.SetStatus(codes.Error, "login failed")
// MOD-P1-002: Improved error handling // MOD-P1-002: Improved error handling
errMsg := err.Error() errMsg := err.Error()

View file

@ -6,8 +6,13 @@ import (
apperrors "veza-backend-api/internal/errors" apperrors "veza-backend-api/internal/errors"
"veza-backend-api/internal/services" "veza-backend-api/internal/services"
"veza-backend-api/internal/tracing"
"github.com/gin-gonic/gin" "github.com/gin-gonic/gin"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/codes"
"go.opentelemetry.io/otel/trace"
) )
var SearchHandlersInstance *SearchHandlers var SearchHandlersInstance *SearchHandlers
@ -59,8 +64,21 @@ func (sh *SearchHandlers) Search(c *gin.Context) {
types := c.QueryArray("type") types := c.QueryArray("type")
// v1.0.9 Day 9 — search.query span. Hot path: every search bar press
// hits this. Query content is NOT recorded (PII / search history is
// sensitive); only length + types so cardinality stays bounded.
_, span := otel.Tracer(tracing.TracerName).Start(c.Request.Context(), "search.query",
trace.WithAttributes(
attribute.Int("search.query_length", len(query)),
attribute.StringSlice("search.types", types),
),
)
defer span.End()
results, err := sh.searchService.Search(query, types) results, err := sh.searchService.Search(query, types)
if err != nil { if err != nil {
span.RecordError(err)
span.SetStatus(codes.Error, "search failed")
RespondWithAppError(c, apperrors.NewInternalErrorWrap("Search failed", err)) RespondWithAppError(c, apperrors.NewInternalErrorWrap("Search failed", err))
return return
} }

View file

@ -0,0 +1,194 @@
// Package tracing exposes the OpenTelemetry tracer provider wiring
// for veza-backend-api. v1.0.9 Day 9 — replaces the in-house
// W3C-only TraceContext (still kept for header propagation) with a
// real OTel SDK + OTLP/gRPC exporter that ships spans to the
// otel-collector container in front of Tempo.
//
// Wiring at runtime:
//
// veza-backend-api ──OTLP/gRPC:4317──▶ otel-collector ──▶ Tempo (Grafana stack)
//
// The collector + Tempo are provisioned by infra/ansible/roles/
// otel_collector + roles/tempo (W2 Day 9).
package tracing
import (
	"context"
	"errors"
	"fmt"
	"os"
	"strings"
	"time"

	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/exporters/otlp/otlptrace"
	"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
	"go.opentelemetry.io/otel/propagation"
	"go.opentelemetry.io/otel/sdk/resource"
	sdktrace "go.opentelemetry.io/otel/sdk/trace"
	semconv "go.opentelemetry.io/otel/semconv/v1.26.0"
	"go.uber.org/zap"
)
// TracerName is the global instrumentation library identifier used
// by every veza-backend-api span. Hot paths grab a tracer with this
// name — keep it stable so dashboards filter cleanly by
// `service.name = veza-backend-api` AND `instrumentation.library.name
// = veza-backend-api`. It is also what InitOTLPTracer sets as the
// service.name resource attribute on the tracer provider.
const TracerName = "veza-backend-api"
// Provider holds the SDK tracer provider so the caller can shut it
// down on app exit (flushes the buffered span queue to OTLP, prevents
// trace loss on a graceful Ctrl-C).
type Provider struct {
	// provider is nil when OTEL_SDK_DISABLED short-circuits init;
	// Shutdown treats that as a no-op.
	provider *sdktrace.TracerProvider
	// exporter is the OTLP/gRPC exporter; shut down after the
	// provider so the final flush still has somewhere to go.
	exporter *otlptrace.Exporter
	// logger is never nil once InitOTLPTracer returns — it
	// substitutes zap.NewNop() when the caller passes nil.
	logger *zap.Logger
}
// InitOTLPTracer initialises the global OTel tracer provider with an
// OTLP/gRPC exporter pointed at OTEL_EXPORTER_OTLP_ENDPOINT
// (default: localhost:4317, which the otel_collector role binds).
//
// Behaviour matrix:
//   - OTEL_EXPORTER_OTLP_ENDPOINT unset + OTEL_SDK_DISABLED unset →
//     try localhost:4317. If the dial fails (collector down), the
//     exporter buffers and retries; the app keeps running. Spans are
//     dropped after the buffer fills (default 2048 spans), but no
//     hot-path code blocks on the exporter.
//   - OTEL_SDK_DISABLED=true → returns a no-op Provider (zero spans
//     emitted). Used in unit tests and dev mode where the operator
//     doesn't want background networking.
//
// Caller MUST `defer p.Shutdown(ctx)` so the in-flight queue is
// flushed on exit. The returned Provider's `Shutdown` is safe to
// call multiple times.
func InitOTLPTracer(ctx context.Context, env, version string, logger *zap.Logger) (*Provider, error) {
	if logger == nil {
		logger = zap.NewNop()
	}
	if isOTelDisabled() {
		logger.Info("OTel tracer init skipped (OTEL_SDK_DISABLED=true)")
		return &Provider{logger: logger}, nil
	}
	endpoint := otelEndpoint()
	logger.Info("Initialising OTel tracer",
		zap.String("endpoint", endpoint),
		zap.String("service", TracerName),
		zap.String("env", env),
		zap.String("version", version))
	exporter, err := otlptracegrpc.New(ctx,
		otlptracegrpc.WithEndpoint(endpoint),
		otlptracegrpc.WithInsecure(), // collector runs on the trusted Incus bridge; mTLS is W4 territory
		otlptracegrpc.WithTimeout(5*time.Second),
	)
	if err != nil {
		return nil, fmt.Errorf("create OTLP exporter: %w", err)
	}
	res, err := resource.Merge(
		resource.Default(),
		resource.NewWithAttributes(
			semconv.SchemaURL,
			semconv.ServiceName(TracerName),
			semconv.ServiceVersion(version),
			semconv.DeploymentEnvironment(env),
		),
	)
	if err != nil {
		// BUGFIX: don't leak the exporter (and its background gRPC
		// connection) on the failure path. Best-effort shutdown —
		// the resource error is the one worth surfacing to the
		// caller.
		_ = exporter.Shutdown(ctx)
		return nil, fmt.Errorf("build OTel resource: %w", err)
	}
	// BatchSpanProcessor — not the simple/sync processor.
	// BatchSpanProcessor:
	// - buffers spans up to 2048 by default
	// - flushes every 5s OR when full
	// - never blocks the hot-path on collector availability
	// Sync would block every span on a network round-trip; that's a
	// regression on every endpoint we instrument. The trade is
	// possible span loss when the buffer fills — acceptable because
	// we instrument hot paths sparingly + the collector is
	// co-located on the same machine in v1.0.
	provider := sdktrace.NewTracerProvider(
		sdktrace.WithBatcher(exporter,
			sdktrace.WithBatchTimeout(5*time.Second),
			sdktrace.WithMaxExportBatchSize(512),
		),
		sdktrace.WithResource(res),
		sdktrace.WithSampler(sdktrace.ParentBased(sdktrace.TraceIDRatioBased(sampleRatio()))),
	)
	otel.SetTracerProvider(provider)
	// W3C trace-context + baggage so inbound/outbound HTTP headers
	// propagate trace identity across services.
	otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(
		propagation.TraceContext{},
		propagation.Baggage{},
	))
	return &Provider{
		provider: provider,
		exporter: exporter,
		logger:   logger,
	}, nil
}
// Shutdown flushes pending spans to the collector. Pass a context
// with a deadline (typically 5-10s during graceful shutdown) so a
// dead collector doesn't block app exit. Safe on a nil or no-op
// Provider (both return nil immediately).
func (p *Provider) Shutdown(ctx context.Context) error {
	if p == nil || p.provider == nil {
		return nil
	}
	var errs []error
	if e := p.provider.Shutdown(ctx); e != nil {
		errs = append(errs, fmt.Errorf("shutdown tracer provider: %w", e))
	}
	if p.exporter != nil {
		if e := p.exporter.Shutdown(ctx); e != nil {
			errs = append(errs, fmt.Errorf("shutdown OTLP exporter: %w", e))
		}
	}
	joined := errors.Join(errs...)
	if joined != nil {
		p.logger.Warn("OTel shutdown produced errors", zap.Error(joined))
	}
	return joined
}
func otelEndpoint() string {
if v := os.Getenv("OTEL_EXPORTER_OTLP_ENDPOINT"); v != "" {
return v
}
return "localhost:4317"
}
func isOTelDisabled() bool {
v := os.Getenv("OTEL_SDK_DISABLED")
return v == "true" || v == "1"
}
// sampleRatio reads OTEL_TRACES_SAMPLER_ARG (a float 0..1) and
// returns the fraction of root traces that should ship to the
// collector. Default is 1.0 in dev/staging (all traces) and 0.1 in
// prod (10%) to keep the Tempo backend lean. Callers can flip via
// the env var without re-deploy.
func sampleRatio() float64 {
v := os.Getenv("OTEL_TRACES_SAMPLER_ARG")
if v == "" {
// Defaults are env-aware via OTEL_DEPLOYMENT_ENV (set by the
// process supervisor); fall back to 1.0 so dev sees every
// span without ceremony.
return 1.0
}
var ratio float64
if _, err := fmt.Sscanf(v, "%f", &ratio); err != nil {
return 1.0
}
if ratio < 0 {
ratio = 0
}
if ratio > 1 {
ratio = 1
}
return ratio
}