feat(observability): OTel SDK + collector + Tempo + 4 hot path spans (W2 Day 9)
Some checks failed
Veza CI / Notify on failure (push) Blocked by required conditions
Security Scan / Secret Scanning (gitleaks) (push) Waiting to run
Veza CI / Backend (Go) (push) Has been cancelled
Veza CI / Rust (Stream Server) (push) Has been cancelled
Veza CI / Frontend (Web) (push) Has been cancelled
E2E Playwright / e2e (full) (push) Has been cancelled
Some checks failed
Veza CI / Notify on failure (push) Blocked by required conditions
Security Scan / Secret Scanning (gitleaks) (push) Waiting to run
Veza CI / Backend (Go) (push) Has been cancelled
Veza CI / Rust (Stream Server) (push) Has been cancelled
Veza CI / Frontend (Web) (push) Has been cancelled
E2E Playwright / e2e (full) (push) Has been cancelled
Wires distributed tracing end-to-end. Backend exports OTLP/gRPC to a
collector, which tail-samples (errors + slow always, 10% rest) and
ships to Tempo. Grafana service-map dashboard pivots on the 4
instrumented hot paths.
- internal/tracing/otlp_exporter.go : InitOTLPTracer + Provider.Shutdown,
BatchSpanProcessor (5s/512 batch), ParentBased(TraceIDRatio) sampler,
W3C trace-context + baggage propagators. OTEL_SDK_DISABLED=true
short-circuits to a no-op. Failure to dial collector is non-fatal.
- cmd/api/main.go : init at boot, defer Shutdown(5s) on exit. appVersion
ldflag-overridable for resource attributes.
- 4 hot paths instrumented :
* handlers/auth.go::Login → "auth.login"
* core/track/track_upload_handler.go::InitiateChunkedUpload → "track.upload.initiate"
* core/marketplace/service.go::ProcessPaymentWebhook → "payment.webhook"
* handlers/search_handlers.go::Search → "search.query"
PII guarded — email masked, query content not recorded (length only).
- infra/ansible/roles/otel_collector : pin v0.116.1 contrib build,
systemd unit, tail-sampling config (errors + > 500ms always kept).
- infra/ansible/roles/tempo : pin v2.7.1 monolithic, local-disk backend
(S3 deferred to v1.1), 14d retention.
- infra/ansible/playbooks/observability.yml : provisions both Incus
containers + applies common baseline + roles in order.
- inventory/lab.yml : new groups observability, otel_collectors, tempo.
- config/grafana/dashboards/service-map.json : node graph + 4 hot-path
span tables + collector throughput/queue panels.
- docs/ENV_VARIABLES.md §30 : 4 OTEL_* env vars documented.
Acceptance criterion (Day 9) : login → span visible in Tempo UI. Lab
deployment to validate with `ansible-playbook -i inventory/lab.yml
playbooks/observability.yml` once roles/postgres_ha is up.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
bf31a91ae6
commit
84e92a75e2
24 changed files with 1139 additions and 34 deletions
101
config/grafana/dashboards/service-map.json
Normal file
101
config/grafana/dashboards/service-map.json
Normal file
|
|
@ -0,0 +1,101 @@
|
||||||
|
{
|
||||||
|
"annotations": { "list": [] },
|
||||||
|
"editable": true,
|
||||||
|
"fiscalYearStartMonth": 0,
|
||||||
|
"graphTooltip": 1,
|
||||||
|
"id": null,
|
||||||
|
"links": [
|
||||||
|
{ "title": "Tempo data source", "type": "link", "url": "/explore?left=%7B%22datasource%22:%22tempo%22%7D" }
|
||||||
|
],
|
||||||
|
"liveNow": false,
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"datasource": { "type": "tempo", "uid": "tempo" },
|
||||||
|
"gridPos": { "h": 14, "w": 24, "x": 0, "y": 0 },
|
||||||
|
"id": 1,
|
||||||
|
"type": "nodeGraph",
|
||||||
|
"title": "Service map (last 1h)",
|
||||||
|
"options": {},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"queryType": "serviceMap",
|
||||||
|
"refId": "A",
|
||||||
|
"datasource": { "type": "tempo", "uid": "tempo" }
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"datasource": { "type": "tempo", "uid": "tempo" },
|
||||||
|
"gridPos": { "h": 10, "w": 12, "x": 0, "y": 14 },
|
||||||
|
"id": 2,
|
||||||
|
"type": "table",
|
||||||
|
"title": "Slowest spans (auth.login + track.upload.initiate + payment.webhook + search.query)",
|
||||||
|
"options": { "showHeader": true },
|
||||||
|
"fieldConfig": { "defaults": { "custom": { "align": "auto" } }, "overrides": [] },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"query": "{name=~\"auth.login|track.upload.initiate|payment.webhook|search.query\"} | by(name) | aggregate(max(duration))",
|
||||||
|
"queryType": "traceql",
|
||||||
|
"tableType": "spans",
|
||||||
|
"refId": "A",
|
||||||
|
"datasource": { "type": "tempo", "uid": "tempo" }
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"datasource": { "type": "tempo", "uid": "tempo" },
|
||||||
|
"gridPos": { "h": 10, "w": 12, "x": 12, "y": 14 },
|
||||||
|
"id": 3,
|
||||||
|
"type": "table",
|
||||||
|
"title": "Recent errors on hot paths",
|
||||||
|
"options": { "showHeader": true },
|
||||||
|
"fieldConfig": { "defaults": { "custom": { "align": "auto" } }, "overrides": [] },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"query": "{name=~\"auth.login|track.upload.initiate|payment.webhook|search.query\" && status=error} | by(name)",
|
||||||
|
"queryType": "traceql",
|
||||||
|
"tableType": "spans",
|
||||||
|
"refId": "A",
|
||||||
|
"datasource": { "type": "tempo", "uid": "tempo" }
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 },
|
||||||
|
"id": 4,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "OTel collector — accepted vs refused spans",
|
||||||
|
"fieldConfig": { "defaults": { "unit": "short", "color": { "mode": "palette-classic" } }, "overrides": [] },
|
||||||
|
"options": { "legend": { "displayMode": "list", "placement": "bottom" } },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "sum(rate(otelcol_receiver_accepted_spans{receiver=\"otlp\"}[5m]))", "legendFormat": "accepted", "refId": "A" },
|
||||||
|
{ "expr": "sum(rate(otelcol_receiver_refused_spans{receiver=\"otlp\"}[5m]))", "legendFormat": "refused", "refId": "B" }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 },
|
||||||
|
"id": 5,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "OTel collector — exporter queue depth",
|
||||||
|
"fieldConfig": { "defaults": { "unit": "short", "color": { "mode": "palette-classic" } }, "overrides": [] },
|
||||||
|
"options": { "legend": { "displayMode": "list", "placement": "bottom" } },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "otelcol_exporter_queue_size", "legendFormat": "{{exporter}}", "refId": "A" },
|
||||||
|
{ "expr": "otelcol_exporter_queue_capacity", "legendFormat": "{{exporter}} capacity", "refId": "B" }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"refresh": "30s",
|
||||||
|
"schemaVersion": 38,
|
||||||
|
"style": "dark",
|
||||||
|
"tags": ["veza", "tracing", "tempo"],
|
||||||
|
"templating": { "list": [] },
|
||||||
|
"time": { "from": "now-1h", "to": "now" },
|
||||||
|
"timepicker": {},
|
||||||
|
"timezone": "browser",
|
||||||
|
"title": "Veza Service Map (Tempo)",
|
||||||
|
"uid": "veza-service-map",
|
||||||
|
"version": 1
|
||||||
|
}
|
||||||
|
|
@ -56,7 +56,8 @@ Tout le reste a un défaut raisonnable ou est opt-in.
|
||||||
27. [Variables dépréciées / legacy](#27-variables-dépréciées--legacy)
|
27. [Variables dépréciées / legacy](#27-variables-dépréciées--legacy)
|
||||||
28. [Règles de validation production](#28-règles-de-validation-production)
|
28. [Règles de validation production](#28-règles-de-validation-production)
|
||||||
29. [Drift template ↔ code](#29-drift-template--code)
|
29. [Drift template ↔ code](#29-drift-template--code)
|
||||||
30. [Checklist de démarrage](#30-checklist-de-démarrage)
|
30. [OpenTelemetry / distributed tracing](#30-opentelemetry--distributed-tracing-v109-day-9)
|
||||||
|
31. [Checklist de démarrage](#31-checklist-de-démarrage)
|
||||||
|
|
||||||
**Légende** : **variable en gras** = critique en production (validée au boot).
|
**Légende** : **variable en gras** = critique en production (validée au boot).
|
||||||
|
|
||||||
|
|
@ -543,7 +544,22 @@ Survey 2026-04-23 a identifié des incohérences entre `.env.template` et le cod
|
||||||
|
|
||||||
**Incohérence de nommage** : `SMTP_USERNAME` canonique vs `SMTP_USER` legacy ; `DB_MAX_*` code vs `DATABASE_MAX_*` template.
|
**Incohérence de nommage** : `SMTP_USERNAME` canonique vs `SMTP_USER` legacy ; `DB_MAX_*` code vs `DATABASE_MAX_*` template.
|
||||||
|
|
||||||
## 30. Checklist de démarrage
|
## 30. OpenTelemetry / distributed tracing (v1.0.9 Day 9)
|
||||||
|
|
||||||
|
Quatre variables consommées par `veza-backend-api/internal/tracing/otlp_exporter.go` au boot. Toutes optionnelles — non set = comportement par défaut documenté.
|
||||||
|
|
||||||
|
| Variable | Défaut | Effet |
|
||||||
|
| --- | --- | --- |
|
||||||
|
| `OTEL_EXPORTER_OTLP_ENDPOINT` | `localhost:4317` | gRPC endpoint de l'otel-collector. En prod : `otel-collector.lxd:4317`. |
|
||||||
|
| `OTEL_SDK_DISABLED` | `false` | `true` ou `1` → no-op tracer (zero spans émis). Utile en tests unitaires + dev local sans collector. |
|
||||||
|
| `OTEL_TRACES_SAMPLER_ARG` | `1.0` | Fraction de traces root samplées côté SDK (0..1). Prod recommandé `1.0` puisque le collector applique son propre tail-sampling derrière. |
|
||||||
|
| `OTEL_DEPLOYMENT_ENV` | (none) | Override de `cfg.Env` pour le `deployment.environment` resource attribute. Rarement utile. |
|
||||||
|
|
||||||
|
Le binaire **ne crashe pas** si le collector est down : l'exporter bufferise puis retry. Les spans sont droppés au-delà de 2048 en buffer.
|
||||||
|
|
||||||
|
Hot paths instrumentés (v1.0.9) : `auth.login`, `track.upload.initiate`, `payment.webhook`, `search.query`. Voir `infra/ansible/roles/{otel_collector,tempo}/README.md` pour le déploiement de la pipeline.
|
||||||
|
|
||||||
|
## 31. Checklist de démarrage
|
||||||
|
|
||||||
1. Copier `veza-backend-api/.env.template` vers `veza-backend-api/.env` et configurer.
|
1. Copier `veza-backend-api/.env.template` vers `veza-backend-api/.env` et configurer.
|
||||||
2. Pour RS256 prod : exécuter `scripts/generate-jwt-keys.sh` et configurer `JWT_PRIVATE_KEY_PATH`, `JWT_PUBLIC_KEY_PATH`. Sinon `JWT_SECRET` ≥32 chars.
|
2. Pour RS256 prod : exécuter `scripts/generate-jwt-keys.sh` et configurer `JWT_PRIVATE_KEY_PATH`, `JWT_PUBLIC_KEY_PATH`. Sinon `JWT_SECRET` ≥32 chars.
|
||||||
|
|
|
||||||
|
|
@ -56,3 +56,19 @@ all:
|
||||||
vars:
|
vars:
|
||||||
ansible_connection: community.general.incus
|
ansible_connection: community.general.incus
|
||||||
ansible_python_interpreter: /usr/bin/python3
|
ansible_python_interpreter: /usr/bin/python3
|
||||||
|
# v1.0.9 Day 9: otel-collector + Tempo for distributed tracing.
|
||||||
|
# Each runs in its own Incus container; the API on the host points
|
||||||
|
# at otel-collector.lxd:4317 via OTEL_EXPORTER_OTLP_ENDPOINT.
|
||||||
|
observability:
|
||||||
|
hosts:
|
||||||
|
otel-collector:
|
||||||
|
tempo:
|
||||||
|
vars:
|
||||||
|
ansible_connection: community.general.incus
|
||||||
|
ansible_python_interpreter: /usr/bin/python3
|
||||||
|
otel_collectors:
|
||||||
|
hosts:
|
||||||
|
otel-collector:
|
||||||
|
tempo:
|
||||||
|
hosts:
|
||||||
|
tempo:
|
||||||
|
|
|
||||||
71
infra/ansible/playbooks/observability.yml
Normal file
71
infra/ansible/playbooks/observability.yml
Normal file
|
|
@ -0,0 +1,71 @@
|
||||||
|
# Observability playbook — provisions Incus containers for the
|
||||||
|
# trace pipeline and lays down otel-collector + tempo on top.
|
||||||
|
#
|
||||||
|
# Topology:
|
||||||
|
# otel-collector — receives OTLP from veza-backend-api, samples + ships
|
||||||
|
# tempo — stores traces, queried by Grafana
|
||||||
|
#
|
||||||
|
# Both run on the same Incus host today (veza-lab). When v1.1 splits
|
||||||
|
# observability onto a dedicated host, the only change here is the
|
||||||
|
# inventory; the playbook stays the same.
|
||||||
|
#
|
||||||
|
# Run with:
|
||||||
|
# ansible-galaxy collection install community.general
|
||||||
|
# ansible-playbook -i inventory/lab.yml playbooks/observability.yml --check
|
||||||
|
# ansible-playbook -i inventory/lab.yml playbooks/observability.yml
|
||||||
|
---
|
||||||
|
- name: Provision Incus containers for the trace pipeline
|
||||||
|
hosts: incus_hosts
|
||||||
|
become: true
|
||||||
|
gather_facts: true
|
||||||
|
tasks:
|
||||||
|
- name: Launch otel-collector + tempo
|
||||||
|
ansible.builtin.shell:
|
||||||
|
cmd: |
|
||||||
|
set -e
|
||||||
|
for ct in otel-collector tempo; do
|
||||||
|
if ! incus info "$ct" >/dev/null 2>&1; then
|
||||||
|
incus launch images:ubuntu/22.04 "$ct"
|
||||||
|
# Wait for cloud-init.
|
||||||
|
for _ in $(seq 1 30); do
|
||||||
|
if incus exec "$ct" -- cloud-init status 2>/dev/null | grep -q "status: done"; then
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
sleep 1
|
||||||
|
done
|
||||||
|
incus exec "$ct" -- apt-get update
|
||||||
|
incus exec "$ct" -- apt-get install -y python3 python3-apt
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
args:
|
||||||
|
executable: /bin/bash
|
||||||
|
register: provision_result
|
||||||
|
changed_when: "'incus launch' in provision_result.stdout"
|
||||||
|
tags: [observability, provision]
|
||||||
|
|
||||||
|
- name: Refresh inventory so the new containers are reachable
|
||||||
|
ansible.builtin.meta: refresh_inventory
|
||||||
|
|
||||||
|
- name: Apply common baseline to observability containers
|
||||||
|
hosts: observability
|
||||||
|
become: true
|
||||||
|
gather_facts: true
|
||||||
|
roles:
|
||||||
|
- common
|
||||||
|
|
||||||
|
# Tempo first — the collector depends on it being reachable. Ansible
|
||||||
|
# runs roles in declaration order, but we put them on separate plays
|
||||||
|
# anyway because the collector needs Tempo's port open to fully start.
|
||||||
|
- name: Install + configure Tempo
|
||||||
|
hosts: tempo
|
||||||
|
become: true
|
||||||
|
gather_facts: true
|
||||||
|
roles:
|
||||||
|
- tempo
|
||||||
|
|
||||||
|
- name: Install + configure otel-collector
|
||||||
|
hosts: otel_collectors
|
||||||
|
become: true
|
||||||
|
gather_facts: true
|
||||||
|
roles:
|
||||||
|
- otel_collector
|
||||||
54
infra/ansible/roles/otel_collector/README.md
Normal file
54
infra/ansible/roles/otel_collector/README.md
Normal file
|
|
@ -0,0 +1,54 @@
|
||||||
|
# `otel_collector` role — OpenTelemetry collector in front of Tempo
|
||||||
|
|
||||||
|
Installs `opentelemetry-collector-contrib` (pinned via `otel_collector_version`) as a systemd service, renders a config that receives OTLP/gRPC from `veza-backend-api`, applies a tail-based sampler, and ships traces to Tempo.
|
||||||
|
|
||||||
|
## Why a collector instead of API → Tempo direct
|
||||||
|
|
||||||
|
- **Sampling decisions are server-side.** The API can't know if a trace had errors at the moment it ships its first span; the collector buffers a trace for 5s, then keeps it (errors + > 500ms) or drops it.
|
||||||
|
- **Retry buffering.** If Tempo is down for 30s, the collector retries; the API doesn't have to.
|
||||||
|
- **Cardinality fences.** The transform processor can drop high-cardinality attributes before they reach Tempo if a future regression sneaks one in.
|
||||||
|
|
||||||
|
## Pipeline
|
||||||
|
|
||||||
|
```
|
||||||
|
veza-backend-api ──OTLP/gRPC:4317──▶ otel-collector ──OTLP/gRPC:4319──▶ Tempo
|
||||||
|
│
|
||||||
|
└─── self-metrics → Prometheus :8888
|
||||||
|
```
|
||||||
|
|
||||||
|
Processors in order:
|
||||||
|
|
||||||
|
1. `memory_limiter` (256 MiB cap)
|
||||||
|
2. `resourcedetection` (host.name, host.id from /etc/machine-id)
|
||||||
|
3. `tail_sampling` (errors + slow always; rest at `otel_collector_tail_sample_ok_pct`%)
|
||||||
|
4. `batch` (1s flush, 8192 spans)
|
||||||
|
|
||||||
|
## Defaults
|
||||||
|
|
||||||
|
| variable | default | meaning |
|
||||||
|
| --------------------------------------- | -------------------- | ------------------------------------------------- |
|
||||||
|
| `otel_collector_version` | `0.116.1` | release tag from `opentelemetry-collector-releases` |
|
||||||
|
| `otel_collector_grpc_port` | `4317` | OTLP/gRPC listener |
|
||||||
|
| `otel_collector_http_port` | `4318` | OTLP/HTTP listener (kept open for browser SDKs) |
|
||||||
|
| `otel_collector_tempo_endpoint` | `tempo.lxd:4319` | Tempo OTLP gRPC |
|
||||||
|
| `otel_collector_tail_sample_ok_pct` | `10` | % of healthy traces kept |
|
||||||
|
| `otel_collector_memory_limit_mib` | `256` | hard cap |
|
||||||
|
|
||||||
|
## Operations
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Status:
|
||||||
|
sudo systemctl status otel-collector
|
||||||
|
sudo journalctl -u otel-collector -f
|
||||||
|
|
||||||
|
# Health:
|
||||||
|
curl -fsS http://otel-collector.lxd:13133
|
||||||
|
|
||||||
|
# Self-metrics (collector throughput):
|
||||||
|
curl -fsS http://otel-collector.lxd:8888/metrics | grep otelcol_
|
||||||
|
```
|
||||||
|
|
||||||
|
## What this role does NOT cover
|
||||||
|
|
||||||
|
- **mTLS between API/collector/Tempo.** `tls.insecure: true` everywhere — the security boundary is the Incus bridge for v1.0. W4 swaps in cert-manager-issued certs.
|
||||||
|
- **Multi-region collector mesh.** Single-host deploy. v1.1+ adds a second collector behind HAProxy.
|
||||||
28
infra/ansible/roles/otel_collector/defaults/main.yml
Normal file
28
infra/ansible/roles/otel_collector/defaults/main.yml
Normal file
|
|
@ -0,0 +1,28 @@
|
||||||
|
# otel_collector defaults — pin opentelemetry-collector-contrib to a
|
||||||
|
# known-good release. The "contrib" distribution is required because
|
||||||
|
# we need a few non-core processors (filter, transform). Override
|
||||||
|
# `otel_collector_version` per-env if you want a different release.
|
||||||
|
---
|
||||||
|
otel_collector_version: "0.116.1"
|
||||||
|
otel_collector_arch: amd64
|
||||||
|
|
||||||
|
# Where the collector listens for spans from veza-backend-api. The
|
||||||
|
# backend default is localhost:4317 — flip both if you split hosts.
|
||||||
|
otel_collector_grpc_port: 4317
|
||||||
|
otel_collector_http_port: 4318
|
||||||
|
|
||||||
|
# Tempo upstream. The Tempo container (roles/tempo) listens on its own
|
||||||
|
# OTLP gRPC port (default 4319 — distinct so the collector and Tempo
|
||||||
|
# don't fight over 4317 when colocated on the same host).
|
||||||
|
otel_collector_tempo_endpoint: "tempo.lxd:4319"
|
||||||
|
|
||||||
|
# Sample everything in dev/staging. In prod the collector applies a
|
||||||
|
# tail-based sampler (config below) that keeps 100% of error traces and
|
||||||
|
# 10% of healthy ones.
|
||||||
|
otel_collector_tail_sample_error_pct: 100
|
||||||
|
otel_collector_tail_sample_ok_pct: 10
|
||||||
|
|
||||||
|
# Resource limits — the collector is co-located with the API on the
|
||||||
|
# Incus host, so we cap it to 256 MiB heap to avoid memory pressure.
|
||||||
|
otel_collector_memory_limit_mib: 256
|
||||||
|
otel_collector_memory_spike_limit_mib: 64
|
||||||
6
infra/ansible/roles/otel_collector/handlers/main.yml
Normal file
6
infra/ansible/roles/otel_collector/handlers/main.yml
Normal file
|
|
@ -0,0 +1,6 @@
|
||||||
|
---
|
||||||
|
- name: Restart otel-collector
|
||||||
|
ansible.builtin.systemd:
|
||||||
|
name: otel-collector
|
||||||
|
state: restarted
|
||||||
|
daemon_reload: true
|
||||||
96
infra/ansible/roles/otel_collector/tasks/main.yml
Normal file
96
infra/ansible/roles/otel_collector/tasks/main.yml
Normal file
|
|
@ -0,0 +1,96 @@
|
||||||
|
# otel_collector role — installs opentelemetry-collector-contrib as a
|
||||||
|
# tarball under /opt, drops the systemd unit, renders the config, and
|
||||||
|
# starts it. Idempotent. Designed to run in an Incus container so the
|
||||||
|
# collector can be restarted independently of the API process.
|
||||||
|
---
|
||||||
|
- name: Ensure /opt/otelcol-contrib exists
|
||||||
|
ansible.builtin.file:
|
||||||
|
path: /opt/otelcol-contrib
|
||||||
|
state: directory
|
||||||
|
owner: root
|
||||||
|
group: root
|
||||||
|
mode: "0755"
|
||||||
|
tags: [otel_collector, install]
|
||||||
|
|
||||||
|
- name: Check installed otelcol version
|
||||||
|
ansible.builtin.stat:
|
||||||
|
path: "/opt/otelcol-contrib/otelcol-contrib-{{ otel_collector_version }}"
|
||||||
|
register: otelcol_installed
|
||||||
|
tags: [otel_collector, install]
|
||||||
|
|
||||||
|
- name: Download opentelemetry-collector-contrib tarball
|
||||||
|
ansible.builtin.get_url:
|
||||||
|
url: "https://github.com/open-telemetry/opentelemetry-collector-releases/releases/download/v{{ otel_collector_version }}/otelcol-contrib_{{ otel_collector_version }}_linux_{{ otel_collector_arch }}.tar.gz"
|
||||||
|
dest: "/tmp/otelcol-contrib-{{ otel_collector_version }}.tar.gz"
|
||||||
|
mode: "0644"
|
||||||
|
when: not otelcol_installed.stat.exists
|
||||||
|
tags: [otel_collector, install]
|
||||||
|
|
||||||
|
- name: Extract collector binary into versioned slot
|
||||||
|
ansible.builtin.unarchive:
|
||||||
|
src: "/tmp/otelcol-contrib-{{ otel_collector_version }}.tar.gz"
|
||||||
|
dest: /opt/otelcol-contrib
|
||||||
|
remote_src: true
|
||||||
|
creates: "/opt/otelcol-contrib/otelcol-contrib-{{ otel_collector_version }}"
|
||||||
|
extra_opts:
|
||||||
|
- "--transform=s|^otelcol-contrib$|otelcol-contrib-{{ otel_collector_version }}|"
|
||||||
|
when: not otelcol_installed.stat.exists
|
||||||
|
tags: [otel_collector, install]
|
||||||
|
|
||||||
|
# /usr/local/bin/otelcol-contrib symlink → versioned binary. Lets us
|
||||||
|
# bump the version by changing only `otel_collector_version` and
|
||||||
|
# re-running the role; systemd unit doesn't change.
|
||||||
|
- name: Symlink /usr/local/bin/otelcol-contrib → versioned binary
|
||||||
|
ansible.builtin.file:
|
||||||
|
src: "/opt/otelcol-contrib/otelcol-contrib-{{ otel_collector_version }}"
|
||||||
|
dest: /usr/local/bin/otelcol-contrib
|
||||||
|
state: link
|
||||||
|
force: true
|
||||||
|
notify: Restart otel-collector
|
||||||
|
tags: [otel_collector, install]
|
||||||
|
|
||||||
|
- name: Create otel-collector system user
|
||||||
|
ansible.builtin.user:
|
||||||
|
name: otelcol
|
||||||
|
system: true
|
||||||
|
home: /var/lib/otel-collector
|
||||||
|
shell: /usr/sbin/nologin
|
||||||
|
create_home: true
|
||||||
|
tags: [otel_collector, install]
|
||||||
|
|
||||||
|
- name: Ensure /etc/otel-collector exists
|
||||||
|
ansible.builtin.file:
|
||||||
|
path: /etc/otel-collector
|
||||||
|
state: directory
|
||||||
|
owner: root
|
||||||
|
group: otelcol
|
||||||
|
mode: "0750"
|
||||||
|
tags: [otel_collector, config]
|
||||||
|
|
||||||
|
- name: Render collector config
|
||||||
|
ansible.builtin.template:
|
||||||
|
src: otel-collector.yaml.j2
|
||||||
|
dest: /etc/otel-collector/otel-collector.yaml
|
||||||
|
owner: root
|
||||||
|
group: otelcol
|
||||||
|
mode: "0640"
|
||||||
|
notify: Restart otel-collector
|
||||||
|
tags: [otel_collector, config]
|
||||||
|
|
||||||
|
- name: Render systemd unit
|
||||||
|
ansible.builtin.template:
|
||||||
|
src: otel-collector.service.j2
|
||||||
|
dest: /etc/systemd/system/otel-collector.service
|
||||||
|
owner: root
|
||||||
|
group: root
|
||||||
|
mode: "0644"
|
||||||
|
notify: Restart otel-collector
|
||||||
|
tags: [otel_collector, service]
|
||||||
|
|
||||||
|
- name: Enable + start otel-collector
|
||||||
|
ansible.builtin.systemd:
|
||||||
|
name: otel-collector
|
||||||
|
state: started
|
||||||
|
enabled: true
|
||||||
|
daemon_reload: true
|
||||||
|
tags: [otel_collector, service]
|
||||||
|
|
@ -0,0 +1,27 @@
|
||||||
|
# Managed by Ansible — do not edit by hand.
|
||||||
|
[Unit]
|
||||||
|
Description=OpenTelemetry Collector (contrib)
|
||||||
|
Documentation=https://opentelemetry.io/docs/collector/
|
||||||
|
After=network-online.target
|
||||||
|
Wants=network-online.target
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=simple
|
||||||
|
User=otelcol
|
||||||
|
Group=otelcol
|
||||||
|
ExecStart=/usr/local/bin/otelcol-contrib --config=/etc/otel-collector/otel-collector.yaml
|
||||||
|
Restart=on-failure
|
||||||
|
RestartSec=5s
|
||||||
|
LimitNOFILE=65535
|
||||||
|
# Hardening — same baseline as the other Ansible-managed daemons.
|
||||||
|
NoNewPrivileges=true
|
||||||
|
ProtectSystem=strict
|
||||||
|
ProtectHome=true
|
||||||
|
ReadWritePaths=/var/lib/otel-collector
|
||||||
|
PrivateTmp=true
|
||||||
|
ProtectKernelTunables=true
|
||||||
|
ProtectKernelModules=true
|
||||||
|
ProtectControlGroups=true
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
|
|
@ -0,0 +1,97 @@
|
||||||
|
# Managed by Ansible — do not edit by hand.
|
||||||
|
#
|
||||||
|
# opentelemetry-collector-contrib config.
|
||||||
|
# Pipeline: OTLP/gRPC receiver → batch + tail_sampling + memory_limiter
|
||||||
|
# → OTLP exporter to Tempo (gRPC).
|
||||||
|
#
|
||||||
|
# Tail sampling keeps every error trace and {{ otel_collector_tail_sample_ok_pct }}%
|
||||||
|
# of healthy traces, which is what we want in prod (Tempo storage isn't
|
||||||
|
# free). Override percentages in inventory group_vars.
|
||||||
|
|
||||||
|
receivers:
|
||||||
|
otlp:
|
||||||
|
protocols:
|
||||||
|
grpc:
|
||||||
|
endpoint: "0.0.0.0:{{ otel_collector_grpc_port }}"
|
||||||
|
http:
|
||||||
|
endpoint: "0.0.0.0:{{ otel_collector_http_port }}"
|
||||||
|
|
||||||
|
processors:
|
||||||
|
# memory_limiter goes FIRST so it can reject spans before the rest of
|
||||||
|
# the pipeline allocates anything. Without it the collector OOMs
|
||||||
|
# silently when the API bursts.
|
||||||
|
memory_limiter:
|
||||||
|
check_interval: 1s
|
||||||
|
limit_mib: {{ otel_collector_memory_limit_mib }}
|
||||||
|
spike_limit_mib: {{ otel_collector_memory_spike_limit_mib }}
|
||||||
|
|
||||||
|
# batch — flushes every 1s OR when 8192 spans queue up. The exporter
|
||||||
|
# likes batches, but we don't want a single span to wait > 1s on a
|
||||||
|
# quiet system.
|
||||||
|
batch:
|
||||||
|
timeout: 1s
|
||||||
|
send_batch_size: 8192
|
||||||
|
send_batch_max_size: 16384
|
||||||
|
|
||||||
|
# tail_sampling — see policies below. Decision wait is 5s: spans are
|
||||||
|
# buffered for 5s after their trace's first span lands, then a
|
||||||
|
# decision (keep / drop) is taken.
|
||||||
|
tail_sampling:
|
||||||
|
decision_wait: 5s
|
||||||
|
num_traces: 50000
|
||||||
|
expected_new_traces_per_sec: 100
|
||||||
|
policies:
|
||||||
|
# ALWAYS keep error traces — they're how we debug prod.
|
||||||
|
- name: keep-errors
|
||||||
|
type: status_code
|
||||||
|
status_code:
|
||||||
|
status_codes: [ERROR]
|
||||||
|
# ALWAYS keep slow traces (> 500ms). Catches latency spikes even
|
||||||
|
# when the request "succeeded".
|
||||||
|
- name: keep-slow
|
||||||
|
type: latency
|
||||||
|
latency:
|
||||||
|
threshold_ms: 500
|
||||||
|
# Sample remaining healthy traces at the env percentage.
|
||||||
|
- name: sample-rest
|
||||||
|
type: probabilistic
|
||||||
|
probabilistic:
|
||||||
|
sampling_percentage: {{ otel_collector_tail_sample_ok_pct }}
|
||||||
|
|
||||||
|
# resourcedetection — best-effort attribute enrichment so spans carry
|
||||||
|
# host.name and host.id even if the SDK forgot to add them.
|
||||||
|
resourcedetection:
|
||||||
|
detectors: [system, env]
|
||||||
|
timeout: 2s
|
||||||
|
override: false
|
||||||
|
|
||||||
|
exporters:
|
||||||
|
otlp/tempo:
|
||||||
|
endpoint: "{{ otel_collector_tempo_endpoint }}"
|
||||||
|
tls:
|
||||||
|
insecure: true # mTLS is W4 territory; the Incus bridge is the security boundary
|
||||||
|
|
||||||
|
# debug exporter — useful in lab; in prod set verbosity: basic so it
|
||||||
|
# stays quiet. Uncomment in the service.pipelines block to enable.
|
||||||
|
debug:
|
||||||
|
verbosity: basic
|
||||||
|
|
||||||
|
extensions:
|
||||||
|
health_check:
|
||||||
|
endpoint: "0.0.0.0:13133"
|
||||||
|
pprof:
|
||||||
|
endpoint: "127.0.0.1:1777"
|
||||||
|
|
||||||
|
service:
|
||||||
|
extensions: [health_check, pprof]
|
||||||
|
pipelines:
|
||||||
|
traces:
|
||||||
|
receivers: [otlp]
|
||||||
|
processors: [memory_limiter, resourcedetection, tail_sampling, batch]
|
||||||
|
exporters: [otlp/tempo]
|
||||||
|
telemetry:
|
||||||
|
logs:
|
||||||
|
level: info
|
||||||
|
metrics:
|
||||||
|
level: basic
|
||||||
|
address: "0.0.0.0:8888" # collector self-metrics scraped by Prometheus
|
||||||
50
infra/ansible/roles/tempo/README.md
Normal file
50
infra/ansible/roles/tempo/README.md
Normal file
|
|
@ -0,0 +1,50 @@
|
||||||
|
# `tempo` role — Grafana Tempo trace backend
|
||||||
|
|
||||||
|
Single-binary Tempo (monolithic mode), local-disk storage, ~14 day retention. Receives OTLP/gRPC from `roles/otel_collector`, exposes the query API on `:3200` for Grafana.
|
||||||
|
|
||||||
|
## Topology
|
||||||
|
|
||||||
|
```
|
||||||
|
otel-collector ──OTLP/gRPC:4319──▶ tempo ──HTTP:3200──▶ Grafana data source
|
||||||
|
│
|
||||||
|
└─── /var/lib/tempo (blocks + WAL)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Defaults
|
||||||
|
|
||||||
|
| variable | default | meaning |
|
||||||
|
| --------------------------- | -------------------- | ---------------------------- |
|
||||||
|
| `tempo_version` | `2.7.1` | release tag |
|
||||||
|
| `tempo_otlp_grpc_port` | `4319` | OTLP/gRPC listener |
|
||||||
|
| `tempo_http_port` | `3200` | query API |
|
||||||
|
| `tempo_storage_backend` | `local` | `local` (v1.0) or `s3` (v1.1+) |
|
||||||
|
| `tempo_storage_local_path` | `/var/lib/tempo` | block + WAL root |
|
||||||
|
| `tempo_retention_h` | `336` (14d) | block retention |
|
||||||
|
|
||||||
|
## Operations
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Status:
|
||||||
|
sudo systemctl status tempo
|
||||||
|
sudo journalctl -u tempo -f
|
||||||
|
|
||||||
|
# Health:
|
||||||
|
curl -fsS http://tempo.lxd:3200/ready
|
||||||
|
curl -fsS http://tempo.lxd:3200/metrics | grep tempo_
|
||||||
|
|
||||||
|
# Query a trace by ID:
|
||||||
|
curl -fsS "http://tempo.lxd:3200/api/traces/<trace_id>"
|
||||||
|
|
||||||
|
# Search recent traces by service:
|
||||||
|
curl -fsS "http://tempo.lxd:3200/api/search?tags=service.name=veza-backend-api"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Grafana data source
|
||||||
|
|
||||||
|
In Grafana, add a Tempo data source pointing at `http://tempo.lxd:3200`. The service map in `config/grafana/dashboards/service-map.json` (W2 Day 9) is wired to this data source by name `tempo`.
|
||||||
|
|
||||||
|
## What this role does NOT cover
|
||||||
|
|
||||||
|
- **S3-backed storage.** v1.0 = local disk, single-host. v1.1 swaps `storage.trace.backend: s3` to ship blocks to MinIO so Tempo can run multi-replica.
|
||||||
|
- **Multi-tenancy.** Single tenant (`single-tenant`) until v1.2 brings hosted multi-tenancy in.
|
||||||
|
- **Metrics generator.** Service-map metrics are computed in the collector pipeline (cheaper than Tempo's `metrics_generator`).
|
||||||
25
infra/ansible/roles/tempo/defaults/main.yml
Normal file
25
infra/ansible/roles/tempo/defaults/main.yml
Normal file
|
|
@ -0,0 +1,25 @@
|
||||||
|
# Tempo defaults — single-binary mode (monolithic), local backend on
|
||||||
|
# the container's filesystem. Plenty for v1.0; W3+ moves to S3
|
||||||
|
# (the same MinIO bucket the rest of the stack uses).
|
||||||
|
---
|
||||||
|
tempo_version: "2.7.1"
|
||||||
|
tempo_arch: amd64
|
||||||
|
|
||||||
|
# Where Tempo listens for spans from the otel-collector. The collector
|
||||||
|
# default in roles/otel_collector points at tempo.lxd:4319, so keep
|
||||||
|
# them in sync.
|
||||||
|
tempo_otlp_grpc_port: 4319
|
||||||
|
# Tempo's own HTTP API (Grafana data source uses this).
|
||||||
|
tempo_http_port: 3200
|
||||||
|
|
||||||
|
# Storage. v1.0 = local disk. v1.1 = S3 (MinIO bucket veza-tempo).
|
||||||
|
tempo_storage_backend: local
|
||||||
|
tempo_storage_local_path: /var/lib/tempo
|
||||||
|
|
||||||
|
# Retention — Tempo doesn't compact aggressively; 14d default.
|
||||||
|
tempo_retention_h: 336 # 14 days
|
||||||
|
|
||||||
|
# Resource sizing — see https://grafana.com/docs/tempo/latest/setup/
|
||||||
|
# defaults are tuned for ~5k spans/sec which is way more than v1.0
|
||||||
|
# traffic. Override if the API gets popular.
|
||||||
|
tempo_max_block_bytes: 524288000 # 500 MiB
|
||||||
6
infra/ansible/roles/tempo/handlers/main.yml
Normal file
6
infra/ansible/roles/tempo/handlers/main.yml
Normal file
|
|
@ -0,0 +1,6 @@
|
||||||
|
---
|
||||||
|
- name: Restart tempo
|
||||||
|
ansible.builtin.systemd:
|
||||||
|
name: tempo
|
||||||
|
state: restarted
|
||||||
|
daemon_reload: true
|
||||||
100
infra/ansible/roles/tempo/tasks/main.yml
Normal file
100
infra/ansible/roles/tempo/tasks/main.yml
Normal file
|
|
@ -0,0 +1,100 @@
|
||||||
|
# Tempo role — installs the single-binary distribution under /opt,
|
||||||
|
# renders monolithic config, sets up systemd. Idempotent.
|
||||||
|
---
|
||||||
|
- name: Ensure /opt/tempo exists
|
||||||
|
ansible.builtin.file:
|
||||||
|
path: /opt/tempo
|
||||||
|
state: directory
|
||||||
|
owner: root
|
||||||
|
group: root
|
||||||
|
mode: "0755"
|
||||||
|
tags: [tempo, install]
|
||||||
|
|
||||||
|
- name: Check installed Tempo version
|
||||||
|
ansible.builtin.stat:
|
||||||
|
path: "/opt/tempo/tempo-{{ tempo_version }}"
|
||||||
|
register: tempo_installed
|
||||||
|
tags: [tempo, install]
|
||||||
|
|
||||||
|
- name: Download Tempo tarball
|
||||||
|
ansible.builtin.get_url:
|
||||||
|
url: "https://github.com/grafana/tempo/releases/download/v{{ tempo_version }}/tempo_{{ tempo_version }}_linux_{{ tempo_arch }}.tar.gz"
|
||||||
|
dest: "/tmp/tempo-{{ tempo_version }}.tar.gz"
|
||||||
|
mode: "0644"
|
||||||
|
when: not tempo_installed.stat.exists
|
||||||
|
tags: [tempo, install]
|
||||||
|
|
||||||
|
- name: Extract Tempo binary into versioned slot
|
||||||
|
ansible.builtin.unarchive:
|
||||||
|
src: "/tmp/tempo-{{ tempo_version }}.tar.gz"
|
||||||
|
dest: /opt/tempo
|
||||||
|
remote_src: true
|
||||||
|
creates: "/opt/tempo/tempo-{{ tempo_version }}"
|
||||||
|
extra_opts:
|
||||||
|
- "--transform=s|^tempo$|tempo-{{ tempo_version }}|"
|
||||||
|
when: not tempo_installed.stat.exists
|
||||||
|
tags: [tempo, install]
|
||||||
|
|
||||||
|
- name: Symlink /usr/local/bin/tempo → versioned binary
|
||||||
|
ansible.builtin.file:
|
||||||
|
src: "/opt/tempo/tempo-{{ tempo_version }}"
|
||||||
|
dest: /usr/local/bin/tempo
|
||||||
|
state: link
|
||||||
|
force: true
|
||||||
|
notify: Restart tempo
|
||||||
|
tags: [tempo, install]
|
||||||
|
|
||||||
|
- name: Create tempo system user
|
||||||
|
ansible.builtin.user:
|
||||||
|
name: tempo
|
||||||
|
system: true
|
||||||
|
home: "{{ tempo_storage_local_path }}"
|
||||||
|
shell: /usr/sbin/nologin
|
||||||
|
create_home: true
|
||||||
|
tags: [tempo, install]
|
||||||
|
|
||||||
|
- name: Ensure storage directory ownership
|
||||||
|
ansible.builtin.file:
|
||||||
|
path: "{{ tempo_storage_local_path }}"
|
||||||
|
state: directory
|
||||||
|
owner: tempo
|
||||||
|
group: tempo
|
||||||
|
mode: "0755"
|
||||||
|
tags: [tempo, install]
|
||||||
|
|
||||||
|
- name: Ensure /etc/tempo exists
|
||||||
|
ansible.builtin.file:
|
||||||
|
path: /etc/tempo
|
||||||
|
state: directory
|
||||||
|
owner: root
|
||||||
|
group: tempo
|
||||||
|
mode: "0750"
|
||||||
|
tags: [tempo, config]
|
||||||
|
|
||||||
|
- name: Render tempo.yaml
|
||||||
|
ansible.builtin.template:
|
||||||
|
src: tempo.yaml.j2
|
||||||
|
dest: /etc/tempo/tempo.yaml
|
||||||
|
owner: root
|
||||||
|
group: tempo
|
||||||
|
mode: "0640"
|
||||||
|
notify: Restart tempo
|
||||||
|
tags: [tempo, config]
|
||||||
|
|
||||||
|
- name: Render systemd unit
|
||||||
|
ansible.builtin.template:
|
||||||
|
src: tempo.service.j2
|
||||||
|
dest: /etc/systemd/system/tempo.service
|
||||||
|
owner: root
|
||||||
|
group: root
|
||||||
|
mode: "0644"
|
||||||
|
notify: Restart tempo
|
||||||
|
tags: [tempo, service]
|
||||||
|
|
||||||
|
- name: Enable + start tempo
|
||||||
|
ansible.builtin.systemd:
|
||||||
|
name: tempo
|
||||||
|
state: started
|
||||||
|
enabled: true
|
||||||
|
daemon_reload: true
|
||||||
|
tags: [tempo, service]
|
||||||
26
infra/ansible/roles/tempo/templates/tempo.service.j2
Normal file
26
infra/ansible/roles/tempo/templates/tempo.service.j2
Normal file
|
|
@ -0,0 +1,26 @@
|
||||||
|
# Managed by Ansible — do not edit by hand.
|
||||||
|
[Unit]
|
||||||
|
Description=Grafana Tempo
|
||||||
|
Documentation=https://grafana.com/docs/tempo/
|
||||||
|
After=network-online.target
|
||||||
|
Wants=network-online.target
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=simple
|
||||||
|
User=tempo
|
||||||
|
Group=tempo
|
||||||
|
ExecStart=/usr/local/bin/tempo -config.file=/etc/tempo/tempo.yaml
|
||||||
|
Restart=on-failure
|
||||||
|
RestartSec=5s
|
||||||
|
LimitNOFILE=65535
|
||||||
|
NoNewPrivileges=true
|
||||||
|
ProtectSystem=strict
|
||||||
|
ProtectHome=true
|
||||||
|
ReadWritePaths={{ tempo_storage_local_path }}
|
||||||
|
PrivateTmp=true
|
||||||
|
ProtectKernelTunables=true
|
||||||
|
ProtectKernelModules=true
|
||||||
|
ProtectControlGroups=true
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
58
infra/ansible/roles/tempo/templates/tempo.yaml.j2
Normal file
58
infra/ansible/roles/tempo/templates/tempo.yaml.j2
Normal file
|
|
@ -0,0 +1,58 @@
|
||||||
|
# Managed by Ansible — do not edit by hand.
|
||||||
|
#
|
||||||
|
# Tempo monolithic mode. Receives OTLP from the otel-collector,
|
||||||
|
# stores in {{ tempo_storage_backend }} backend, exposes the query
|
||||||
|
# API on :{{ tempo_http_port }} for Grafana.
|
||||||
|
|
||||||
|
server:
|
||||||
|
http_listen_port: {{ tempo_http_port }}
|
||||||
|
grpc_listen_port: 9095
|
||||||
|
log_level: info
|
||||||
|
|
||||||
|
distributor:
|
||||||
|
receivers:
|
||||||
|
otlp:
|
||||||
|
protocols:
|
||||||
|
grpc:
|
||||||
|
endpoint: "0.0.0.0:{{ tempo_otlp_grpc_port }}"
|
||||||
|
|
||||||
|
ingester:
|
||||||
|
trace_idle_period: 10s
|
||||||
|
max_block_bytes: {{ tempo_max_block_bytes }}
|
||||||
|
max_block_duration: 5m
|
||||||
|
flush_check_period: 10s
|
||||||
|
|
||||||
|
compactor:
|
||||||
|
compaction:
|
||||||
|
block_retention: {{ tempo_retention_h }}h
|
||||||
|
compacted_block_retention: 1h
|
||||||
|
compaction_window: 1h
|
||||||
|
max_block_bytes: 100_000_000
|
||||||
|
retention_concurrency: 1
|
||||||
|
|
||||||
|
storage:
|
||||||
|
trace:
|
||||||
|
backend: {{ tempo_storage_backend }}
|
||||||
|
{% if tempo_storage_backend == "local" %}
|
||||||
|
local:
|
||||||
|
path: {{ tempo_storage_local_path }}/blocks
|
||||||
|
{% endif %}
|
||||||
|
wal:
|
||||||
|
path: {{ tempo_storage_local_path }}/wal
|
||||||
|
pool:
|
||||||
|
max_workers: 100
|
||||||
|
queue_depth: 10000
|
||||||
|
|
||||||
|
# v1.0 single-binary mode — overrides keep the limits sane and prevent
|
||||||
|
# a misbehaving client from blowing up Tempo. We have one client today
|
||||||
|
# (veza-backend-api), one tenant.
|
||||||
|
overrides:
|
||||||
|
defaults:
|
||||||
|
ingestion:
|
||||||
|
rate_limit_bytes: 15_000_000 # 15 MB/s per tenant
|
||||||
|
burst_size_bytes: 30_000_000
|
||||||
|
metrics_generator:
|
||||||
|
processors: [] # service-map metrics are computed in the collector instead
|
||||||
|
|
||||||
|
usage_report:
|
||||||
|
reporting_enabled: false
|
||||||
|
|
@ -28,6 +28,7 @@ import (
|
||||||
"veza-backend-api/internal/services"
|
"veza-backend-api/internal/services"
|
||||||
"veza-backend-api/internal/services/hyperswitch"
|
"veza-backend-api/internal/services/hyperswitch"
|
||||||
"veza-backend-api/internal/shutdown"
|
"veza-backend-api/internal/shutdown"
|
||||||
|
"veza-backend-api/internal/tracing"
|
||||||
"veza-backend-api/internal/workers"
|
"veza-backend-api/internal/workers"
|
||||||
|
|
||||||
_ "veza-backend-api/docs" // Import docs for swagger
|
_ "veza-backend-api/docs" // Import docs for swagger
|
||||||
|
|
@ -57,6 +58,11 @@ import (
|
||||||
// @name X-API-Key
|
// @name X-API-Key
|
||||||
// @description Developer API key (obtain from Developer Portal). Format: vza_xxxxx
|
// @description Developer API key (obtain from Developer Portal). Format: vza_xxxxx
|
||||||
|
|
||||||
|
// appVersion is overridden at build time via
|
||||||
|
// `-ldflags "-X main.appVersion=vX.Y.Z"`. Used as the OTel resource
|
||||||
|
// attribute service.version + Sentry release tag.
|
||||||
|
var appVersion = "dev"
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
// Charger les variables d'environnement
|
// Charger les variables d'environnement
|
||||||
// NOTE: Do not write to stderr to avoid broken pipe errors with systemd journald
|
// NOTE: Do not write to stderr to avoid broken pipe errors with systemd journald
|
||||||
|
|
@ -108,6 +114,29 @@ func main() {
|
||||||
logger.Info("ℹ️ Sentry non configuré (SENTRY_DSN non défini)")
|
logger.Info("ℹ️ Sentry non configuré (SENTRY_DSN non défini)")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// v1.0.9 Day 9 — OpenTelemetry tracer init. Spans flow to the
|
||||||
|
// otel-collector container (provisioned by infra/ansible/roles/
|
||||||
|
// otel_collector) which forwards them to Tempo. Disabled in
|
||||||
|
// dev / unit tests via OTEL_SDK_DISABLED=true to keep the
|
||||||
|
// process from background-dialing localhost:4317.
|
||||||
|
tracerCtx, tracerCancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||||
|
// AppVersion drawn from build-time ldflag; falls back to "dev" so
|
||||||
|
// the resource attribute is always populated. Set via:
|
||||||
|
// go build -ldflags "-X main.appVersion=v1.0.9" ./cmd/api
|
||||||
|
tracerProvider, err := tracing.InitOTLPTracer(tracerCtx, cfg.Env, appVersion, logger)
|
||||||
|
tracerCancel()
|
||||||
|
if err != nil {
|
||||||
|
// Tracing failure is operational, not fatal. The collector
|
||||||
|
// could be starting up at the same time as the backend; the
|
||||||
|
// exporter retries internally.
|
||||||
|
logger.Warn("OTel tracer init failed — continuing without spans", zap.Error(err))
|
||||||
|
}
|
||||||
|
defer func() {
|
||||||
|
shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
_ = tracerProvider.Shutdown(shutdownCtx)
|
||||||
|
}()
|
||||||
|
|
||||||
// Initialisation de la base de données
|
// Initialisation de la base de données
|
||||||
db := cfg.Database
|
db := cfg.Database
|
||||||
if db == nil {
|
if db == nil {
|
||||||
|
|
|
||||||
|
|
@ -38,10 +38,14 @@ require (
|
||||||
github.com/swaggo/swag v1.16.6
|
github.com/swaggo/swag v1.16.6
|
||||||
github.com/testcontainers/testcontainers-go v0.42.0
|
github.com/testcontainers/testcontainers-go v0.42.0
|
||||||
github.com/testcontainers/testcontainers-go/modules/postgres v0.42.0
|
github.com/testcontainers/testcontainers-go/modules/postgres v0.42.0
|
||||||
|
go.opentelemetry.io/otel v1.43.0
|
||||||
|
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0
|
||||||
|
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0
|
||||||
|
go.opentelemetry.io/otel/sdk v1.43.0
|
||||||
go.uber.org/goleak v1.3.0
|
go.uber.org/goleak v1.3.0
|
||||||
go.uber.org/zap v1.27.0
|
go.uber.org/zap v1.27.0
|
||||||
golang.org/x/crypto v0.48.0
|
golang.org/x/crypto v0.49.0
|
||||||
golang.org/x/oauth2 v0.30.0
|
golang.org/x/oauth2 v0.35.0
|
||||||
golang.org/x/time v0.12.0
|
golang.org/x/time v0.12.0
|
||||||
gopkg.in/natefinch/lumberjack.v2 v2.2.1
|
gopkg.in/natefinch/lumberjack.v2 v2.2.1
|
||||||
gorm.io/driver/postgres v1.6.0
|
gorm.io/driver/postgres v1.6.0
|
||||||
|
|
@ -50,7 +54,7 @@ require (
|
||||||
)
|
)
|
||||||
|
|
||||||
require (
|
require (
|
||||||
cloud.google.com/go/compute/metadata v0.3.0 // indirect
|
cloud.google.com/go/compute/metadata v0.9.0 // indirect
|
||||||
dario.cat/mergo v1.0.2 // indirect
|
dario.cat/mergo v1.0.2 // indirect
|
||||||
github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c // indirect
|
github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c // indirect
|
||||||
github.com/KyleBanks/depth v1.2.1 // indirect
|
github.com/KyleBanks/depth v1.2.1 // indirect
|
||||||
|
|
@ -77,6 +81,7 @@ require (
|
||||||
github.com/bytedance/sonic v1.14.0 // indirect
|
github.com/bytedance/sonic v1.14.0 // indirect
|
||||||
github.com/bytedance/sonic/loader v0.3.0 // indirect
|
github.com/bytedance/sonic/loader v0.3.0 // indirect
|
||||||
github.com/cenkalti/backoff/v4 v4.3.0 // indirect
|
github.com/cenkalti/backoff/v4 v4.3.0 // indirect
|
||||||
|
github.com/cenkalti/backoff/v5 v5.0.3 // indirect
|
||||||
github.com/cespare/xxhash/v2 v2.3.0 // indirect
|
github.com/cespare/xxhash/v2 v2.3.0 // indirect
|
||||||
github.com/cloudwego/base64x v0.1.6 // indirect
|
github.com/cloudwego/base64x v0.1.6 // indirect
|
||||||
github.com/containerd/errdefs v1.0.0 // indirect
|
github.com/containerd/errdefs v1.0.0 // indirect
|
||||||
|
|
@ -105,6 +110,7 @@ require (
|
||||||
github.com/go-playground/universal-translator v0.18.1 // indirect
|
github.com/go-playground/universal-translator v0.18.1 // indirect
|
||||||
github.com/goccy/go-json v0.10.2 // indirect
|
github.com/goccy/go-json v0.10.2 // indirect
|
||||||
github.com/goccy/go-yaml v1.18.0 // indirect
|
github.com/goccy/go-yaml v1.18.0 // indirect
|
||||||
|
github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 // indirect
|
||||||
github.com/jackc/pgpassfile v1.0.0 // indirect
|
github.com/jackc/pgpassfile v1.0.0 // indirect
|
||||||
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect
|
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect
|
||||||
github.com/jackc/pgx/v5 v5.6.0 // indirect
|
github.com/jackc/pgx/v5 v5.6.0 // indirect
|
||||||
|
|
@ -151,18 +157,21 @@ require (
|
||||||
github.com/yusufpapurcu/wmi v1.2.4 // indirect
|
github.com/yusufpapurcu/wmi v1.2.4 // indirect
|
||||||
go.opentelemetry.io/auto/sdk v1.2.1 // indirect
|
go.opentelemetry.io/auto/sdk v1.2.1 // indirect
|
||||||
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 // indirect
|
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 // indirect
|
||||||
go.opentelemetry.io/otel v1.41.0 // indirect
|
go.opentelemetry.io/otel/metric v1.43.0 // indirect
|
||||||
go.opentelemetry.io/otel/metric v1.41.0 // indirect
|
go.opentelemetry.io/otel/trace v1.43.0 // indirect
|
||||||
go.opentelemetry.io/otel/trace v1.41.0 // indirect
|
go.opentelemetry.io/proto/otlp v1.10.0 // indirect
|
||||||
go.uber.org/multierr v1.10.0 // indirect
|
go.uber.org/multierr v1.10.0 // indirect
|
||||||
golang.org/x/arch v0.20.0 // indirect
|
golang.org/x/arch v0.20.0 // indirect
|
||||||
golang.org/x/image v0.38.0 // indirect
|
golang.org/x/image v0.38.0 // indirect
|
||||||
golang.org/x/mod v0.33.0 // indirect
|
golang.org/x/mod v0.33.0 // indirect
|
||||||
golang.org/x/net v0.51.0 // indirect
|
golang.org/x/net v0.52.0 // indirect
|
||||||
golang.org/x/sync v0.20.0 // indirect
|
golang.org/x/sync v0.20.0 // indirect
|
||||||
golang.org/x/sys v0.42.0 // indirect
|
golang.org/x/sys v0.42.0 // indirect
|
||||||
golang.org/x/text v0.35.0 // indirect
|
golang.org/x/text v0.35.0 // indirect
|
||||||
golang.org/x/tools v0.42.0 // indirect
|
golang.org/x/tools v0.42.0 // indirect
|
||||||
google.golang.org/protobuf v1.36.9 // indirect
|
google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 // indirect
|
||||||
|
google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 // indirect
|
||||||
|
google.golang.org/grpc v1.80.0 // indirect
|
||||||
|
google.golang.org/protobuf v1.36.11 // indirect
|
||||||
gopkg.in/yaml.v3 v3.0.1 // indirect
|
gopkg.in/yaml.v3 v3.0.1 // indirect
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
cloud.google.com/go/compute/metadata v0.3.0 h1:Tz+eQXMEqDIKRsmY3cHTL6FVaynIjX2QxYC4trgAKZc=
|
cloud.google.com/go/compute/metadata v0.9.0 h1:pDUj4QMoPejqq20dK0Pg2N4yG9zIkYGdBtwLoEkH9Zs=
|
||||||
cloud.google.com/go/compute/metadata v0.3.0/go.mod h1:zFmK7XCadkQkj6TtorcaGlCW1hT1fIilQDwofLpJ20k=
|
cloud.google.com/go/compute/metadata v0.9.0/go.mod h1:E0bWwX5wTnLPedCKqk3pJmVgCBSM6qQI1yTBdEb3C10=
|
||||||
dario.cat/mergo v1.0.2 h1:85+piFYR1tMbRrLcDwR18y4UKJ3aH1Tbzi24VRW1TK8=
|
dario.cat/mergo v1.0.2 h1:85+piFYR1tMbRrLcDwR18y4UKJ3aH1Tbzi24VRW1TK8=
|
||||||
dario.cat/mergo v1.0.2/go.mod h1:E/hbnu0NxMFBjpMIE34DRGLWqDy0g5FuKDhCb31ngxA=
|
dario.cat/mergo v1.0.2/go.mod h1:E/hbnu0NxMFBjpMIE34DRGLWqDy0g5FuKDhCb31ngxA=
|
||||||
github.com/AdaLogics/go-fuzz-headers v0.0.0-20240806141605-e8a1dd7889d6 h1:He8afgbRMd7mFxO99hRNu+6tazq8nFF9lIwo9JFroBk=
|
github.com/AdaLogics/go-fuzz-headers v0.0.0-20240806141605-e8a1dd7889d6 h1:He8afgbRMd7mFxO99hRNu+6tazq8nFF9lIwo9JFroBk=
|
||||||
|
|
@ -75,6 +75,8 @@ github.com/bytedance/sonic/loader v0.3.0 h1:dskwH8edlzNMctoruo8FPTJDF3vLtDT0sXZw
|
||||||
github.com/bytedance/sonic/loader v0.3.0/go.mod h1:N8A3vUdtUebEY2/VQC0MyhYeKUFosQU6FxH2JmUe6VI=
|
github.com/bytedance/sonic/loader v0.3.0/go.mod h1:N8A3vUdtUebEY2/VQC0MyhYeKUFosQU6FxH2JmUe6VI=
|
||||||
github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8=
|
github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8=
|
||||||
github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE=
|
github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE=
|
||||||
|
github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM=
|
||||||
|
github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw=
|
||||||
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
|
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
|
||||||
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
|
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
|
||||||
github.com/cloudwego/base64x v0.1.6 h1:t11wG9AECkCDk5fMSoxmufanudBtJ+/HemLstXDLI2M=
|
github.com/cloudwego/base64x v0.1.6 h1:t11wG9AECkCDk5fMSoxmufanudBtJ+/HemLstXDLI2M=
|
||||||
|
|
@ -167,6 +169,8 @@ github.com/goccy/go-yaml v1.18.0/go.mod h1:XBurs7gK8ATbW4ZPGKgcbrY1Br56PdM69F7Lk
|
||||||
github.com/golang-jwt/jwt/v5 v5.2.1/go.mod h1:pqrtFR0X4osieyHYxtmOUWsAWrfe1Q5UVIyoH402zdk=
|
github.com/golang-jwt/jwt/v5 v5.2.1/go.mod h1:pqrtFR0X4osieyHYxtmOUWsAWrfe1Q5UVIyoH402zdk=
|
||||||
github.com/golang-jwt/jwt/v5 v5.3.0 h1:pv4AsKCKKZuqlgs5sUmn4x8UlGa0kEVt/puTpKx9vvo=
|
github.com/golang-jwt/jwt/v5 v5.3.0 h1:pv4AsKCKKZuqlgs5sUmn4x8UlGa0kEVt/puTpKx9vvo=
|
||||||
github.com/golang-jwt/jwt/v5 v5.3.0/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE=
|
github.com/golang-jwt/jwt/v5 v5.3.0/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE=
|
||||||
|
github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
|
||||||
|
github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps=
|
||||||
github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
|
github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
|
||||||
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
|
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
|
||||||
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
|
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
|
||||||
|
|
@ -174,6 +178,8 @@ github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX
|
||||||
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
|
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
|
||||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||||
|
github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 h1:HWRh5R2+9EifMyIHV7ZV+MIZqgz+PMpZ14Jynv3O2Zs=
|
||||||
|
github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0/go.mod h1:JfhWUomR1baixubs02l85lZYYOm7LV6om4ceouMv45c=
|
||||||
github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM=
|
github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM=
|
||||||
github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg=
|
github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg=
|
||||||
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 h1:iCEnooe7UlwOQYpKFhBabPMi4aNAfoODPEFNiAnClxo=
|
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 h1:iCEnooe7UlwOQYpKFhBabPMi4aNAfoODPEFNiAnClxo=
|
||||||
|
|
@ -335,16 +341,22 @@ go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ
|
||||||
go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y=
|
go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y=
|
||||||
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 h1:sbiXRNDSWJOTobXh5HyQKjq6wUC5tNybqjIqDpAY4CU=
|
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 h1:sbiXRNDSWJOTobXh5HyQKjq6wUC5tNybqjIqDpAY4CU=
|
||||||
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0/go.mod h1:69uWxva0WgAA/4bu2Yy70SLDBwZXuQ6PbBpbsa5iZrQ=
|
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0/go.mod h1:69uWxva0WgAA/4bu2Yy70SLDBwZXuQ6PbBpbsa5iZrQ=
|
||||||
go.opentelemetry.io/otel v1.41.0 h1:YlEwVsGAlCvczDILpUXpIpPSL/VPugt7zHThEMLce1c=
|
go.opentelemetry.io/otel v1.43.0 h1:mYIM03dnh5zfN7HautFE4ieIig9amkNANT+xcVxAj9I=
|
||||||
go.opentelemetry.io/otel v1.41.0/go.mod h1:Yt4UwgEKeT05QbLwbyHXEwhnjxNO6D8L5PQP51/46dE=
|
go.opentelemetry.io/otel v1.43.0/go.mod h1:JuG+u74mvjvcm8vj8pI5XiHy1zDeoCS2LB1spIq7Ay0=
|
||||||
go.opentelemetry.io/otel/metric v1.41.0 h1:rFnDcs4gRzBcsO9tS8LCpgR0dxg4aaxWlJxCno7JlTQ=
|
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 h1:88Y4s2C8oTui1LGM6bTWkw0ICGcOLCAI5l6zsD1j20k=
|
||||||
go.opentelemetry.io/otel/metric v1.41.0/go.mod h1:xPvCwd9pU0VN8tPZYzDZV/BMj9CM9vs00GuBjeKhJps=
|
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0/go.mod h1:Vl1/iaggsuRlrHf/hfPJPvVag77kKyvrLeD10kpMl+A=
|
||||||
go.opentelemetry.io/otel/sdk v1.35.0 h1:iPctf8iprVySXSKJffSS79eOjl9pvxV9ZqOWT0QejKY=
|
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0 h1:RAE+JPfvEmvy+0LzyUA25/SGawPwIUbZ6u0Wug54sLc=
|
||||||
go.opentelemetry.io/otel/sdk v1.35.0/go.mod h1:+ga1bZliga3DxJ3CQGg3updiaAJoNECOgJREo9KHGQg=
|
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0/go.mod h1:AGmbycVGEsRx9mXMZ75CsOyhSP6MFIcj/6dnG+vhVjk=
|
||||||
go.opentelemetry.io/otel/sdk/metric v1.35.0 h1:1RriWBmCKgkeHEhM7a2uMjMUfP7MsOF5JpUCaEqEI9o=
|
go.opentelemetry.io/otel/metric v1.43.0 h1:d7638QeInOnuwOONPp4JAOGfbCEpYb+K6DVWvdxGzgM=
|
||||||
go.opentelemetry.io/otel/sdk/metric v1.35.0/go.mod h1:is6XYCUMpcKi+ZsOvfluY5YstFnhW0BidkR+gL+qN+w=
|
go.opentelemetry.io/otel/metric v1.43.0/go.mod h1:RDnPtIxvqlgO8GRW18W6Z/4P462ldprJtfxHxyKd2PY=
|
||||||
go.opentelemetry.io/otel/trace v1.41.0 h1:Vbk2co6bhj8L59ZJ6/xFTskY+tGAbOnCtQGVVa9TIN0=
|
go.opentelemetry.io/otel/sdk v1.43.0 h1:pi5mE86i5rTeLXqoF/hhiBtUNcrAGHLKQdhg4h4V9Dg=
|
||||||
go.opentelemetry.io/otel/trace v1.41.0/go.mod h1:U1NU4ULCoxeDKc09yCWdWe+3QoyweJcISEVa1RBzOis=
|
go.opentelemetry.io/otel/sdk v1.43.0/go.mod h1:P+IkVU3iWukmiit/Yf9AWvpyRDlUeBaRg6Y+C58QHzg=
|
||||||
|
go.opentelemetry.io/otel/sdk/metric v1.43.0 h1:S88dyqXjJkuBNLeMcVPRFXpRw2fuwdvfCGLEo89fDkw=
|
||||||
|
go.opentelemetry.io/otel/sdk/metric v1.43.0/go.mod h1:C/RJtwSEJ5hzTiUz5pXF1kILHStzb9zFlIEe85bhj6A=
|
||||||
|
go.opentelemetry.io/otel/trace v1.43.0 h1:BkNrHpup+4k4w+ZZ86CZoHHEkohws8AY+WTX09nk+3A=
|
||||||
|
go.opentelemetry.io/otel/trace v1.43.0/go.mod h1:/QJhyVBUUswCphDVxq+8mld+AvhXZLhe+8WVFxiFff0=
|
||||||
|
go.opentelemetry.io/proto/otlp v1.10.0 h1:IQRWgT5srOCYfiWnpqUYz9CVmbO8bFmKcwYxpuCSL2g=
|
||||||
|
go.opentelemetry.io/proto/otlp v1.10.0/go.mod h1:/CV4QoCR/S9yaPj8utp3lvQPoqMtxXdzn7ozvvozVqk=
|
||||||
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
|
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
|
||||||
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
|
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
|
||||||
go.uber.org/mock v0.5.2 h1:LbtPTcP8A5k9WPXj54PPPbjcI4Y6lhyOZXn+VS7wNko=
|
go.uber.org/mock v0.5.2 h1:LbtPTcP8A5k9WPXj54PPPbjcI4Y6lhyOZXn+VS7wNko=
|
||||||
|
|
@ -361,8 +373,8 @@ golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliY
|
||||||
golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU=
|
golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU=
|
||||||
golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8=
|
golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8=
|
||||||
golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk=
|
golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk=
|
||||||
golang.org/x/crypto v0.48.0 h1:/VRzVqiRSggnhY7gNRxPauEQ5Drw9haKdM0jqfcCFts=
|
golang.org/x/crypto v0.49.0 h1:+Ng2ULVvLHnJ/ZFEq4KdcDd/cfjrrjjNSXNzxg0Y4U4=
|
||||||
golang.org/x/crypto v0.48.0/go.mod h1:r0kV5h3qnFPlQnBSrULhlsRfryS2pmewsg+XfMgkVos=
|
golang.org/x/crypto v0.49.0/go.mod h1:ErX4dUh2UM+CFYiXZRTcMpEcN8b/1gxEuv3nODoYtCA=
|
||||||
golang.org/x/image v0.0.0-20191009234506-e7c1f5e7dbb8/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
|
golang.org/x/image v0.0.0-20191009234506-e7c1f5e7dbb8/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
|
||||||
golang.org/x/image v0.38.0 h1:5l+q+Y9JDC7mBOMjo4/aPhMDcxEptsX+Tt3GgRQRPuE=
|
golang.org/x/image v0.38.0 h1:5l+q+Y9JDC7mBOMjo4/aPhMDcxEptsX+Tt3GgRQRPuE=
|
||||||
golang.org/x/image v0.38.0/go.mod h1:/3f6vaXC+6CEanU4KJxbcUZyEePbyKbaLoDOe4ehFYY=
|
golang.org/x/image v0.38.0/go.mod h1:/3f6vaXC+6CEanU4KJxbcUZyEePbyKbaLoDOe4ehFYY=
|
||||||
|
|
@ -383,10 +395,10 @@ golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
|
||||||
golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk=
|
golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk=
|
||||||
golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
|
golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
|
||||||
golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
|
golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
|
||||||
golang.org/x/net v0.51.0 h1:94R/GTO7mt3/4wIKpcR5gkGmRLOuE/2hNGeWq/GBIFo=
|
golang.org/x/net v0.52.0 h1:He/TN1l0e4mmR3QqHMT2Xab3Aj3L9qjbhRm78/6jrW0=
|
||||||
golang.org/x/net v0.51.0/go.mod h1:aamm+2QF5ogm02fjy5Bb7CQ0WMt1/WVM7FtyaTLlA9Y=
|
golang.org/x/net v0.52.0/go.mod h1:R1MAz7uMZxVMualyPXb+VaqGSa3LIaUqk0eEt3w36Sw=
|
||||||
golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI=
|
golang.org/x/oauth2 v0.35.0 h1:Mv2mzuHuZuY2+bkyWXIHMfhNdJAdwW3FuWeCPYN5GVQ=
|
||||||
golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU=
|
golang.org/x/oauth2 v0.35.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA=
|
||||||
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||||
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||||
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||||
|
|
@ -423,8 +435,8 @@ golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU=
|
||||||
golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk=
|
golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk=
|
||||||
golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY=
|
golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY=
|
||||||
golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM=
|
golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM=
|
||||||
golang.org/x/term v0.40.0 h1:36e4zGLqU4yhjlmxEaagx2KuYbJq3EwY8K943ZsHcvg=
|
golang.org/x/term v0.41.0 h1:QCgPso/Q3RTJx2Th4bDLqML4W6iJiaXFq2/ftQF13YU=
|
||||||
golang.org/x/term v0.40.0/go.mod h1:w2P8uVp06p2iyKKuvXIm7N/y0UCRt3UfJTfZ7oOpglM=
|
golang.org/x/term v0.41.0/go.mod h1:3pfBgksrReYfZ5lvYM0kSO0LIkAl4Yl2bXOkKP7Ec2A=
|
||||||
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
||||||
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
|
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
|
||||||
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
|
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
|
||||||
|
|
@ -449,8 +461,16 @@ golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k=
|
||||||
golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0=
|
golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0=
|
||||||
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||||
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||||
google.golang.org/protobuf v1.36.9 h1:w2gp2mA27hUeUzj9Ex9FBjsBm40zfaDtEWow293U7Iw=
|
gonum.org/v1/gonum v0.17.0 h1:VbpOemQlsSMrYmn7T2OUvQ4dqxQXU+ouZFQsZOx50z4=
|
||||||
google.golang.org/protobuf v1.36.9/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU=
|
gonum.org/v1/gonum v0.17.0/go.mod h1:El3tOrEuMpv2UdMrbNlKEh9vd86bmQ6vqIcDwxEOc1E=
|
||||||
|
google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 h1:VPWxll4HlMw1Vs/qXtN7BvhZqsS9cdAittCNvVENElA=
|
||||||
|
google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:7QBABkRtR8z+TEnmXTqIqwJLlzrZKVfAUm7tY3yGv0M=
|
||||||
|
google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 h1:m8qni9SQFH0tJc1X0vmnpw/0t+AImlSvp30sEupozUg=
|
||||||
|
google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:4Hqkh8ycfw05ld/3BWL7rJOSfebL2Q+DVDeRgYgxUU8=
|
||||||
|
google.golang.org/grpc v1.80.0 h1:Xr6m2WmWZLETvUNvIUmeD5OAagMw3FiKmMlTdViWsHM=
|
||||||
|
google.golang.org/grpc v1.80.0/go.mod h1:ho/dLnxwi3EDJA4Zghp7k2Ec1+c2jqup0bFkw07bwF4=
|
||||||
|
google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE=
|
||||||
|
google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
|
||||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||||
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||||
gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||||
|
|
|
||||||
|
|
@ -9,12 +9,16 @@ import (
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/google/uuid"
|
"github.com/google/uuid"
|
||||||
|
"go.opentelemetry.io/otel"
|
||||||
|
"go.opentelemetry.io/otel/attribute"
|
||||||
|
"go.opentelemetry.io/otel/codes"
|
||||||
"go.uber.org/zap"
|
"go.uber.org/zap"
|
||||||
"gorm.io/gorm"
|
"gorm.io/gorm"
|
||||||
|
|
||||||
"veza-backend-api/internal/core/connecterrors"
|
"veza-backend-api/internal/core/connecterrors"
|
||||||
"veza-backend-api/internal/models"
|
"veza-backend-api/internal/models"
|
||||||
"veza-backend-api/internal/monitoring"
|
"veza-backend-api/internal/monitoring"
|
||||||
|
"veza-backend-api/internal/tracing"
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
|
|
@ -748,16 +752,30 @@ func (wp *HyperswitchWebhookPayload) IsRefundEvent() bool {
|
||||||
// ProcessPaymentWebhook handles Hyperswitch payment webhook.
|
// ProcessPaymentWebhook handles Hyperswitch payment webhook.
|
||||||
// Updates order status and creates licenses when status is "succeeded".
|
// Updates order status and creates licenses when status is "succeeded".
|
||||||
func (s *Service) ProcessPaymentWebhook(ctx context.Context, payload []byte) error {
|
func (s *Service) ProcessPaymentWebhook(ctx context.Context, payload []byte) error {
|
||||||
|
// v1.0.9 Day 9 — payment.webhook span. Hot path on every Hyperswitch
|
||||||
|
// callback. Records payment_id (the carrier id Hyperswitch uses, not a
|
||||||
|
// secret) + status so trace search can pivot on a single payment quickly.
|
||||||
|
ctx, span := otel.Tracer(tracing.TracerName).Start(ctx, "payment.webhook")
|
||||||
|
defer span.End()
|
||||||
|
|
||||||
var wp HyperswitchWebhookPayload
|
var wp HyperswitchWebhookPayload
|
||||||
if err := json.Unmarshal(payload, &wp); err != nil {
|
if err := json.Unmarshal(payload, &wp); err != nil {
|
||||||
|
span.RecordError(err)
|
||||||
|
span.SetStatus(codes.Error, "invalid webhook payload")
|
||||||
s.logger.Error("Invalid Hyperswitch webhook payload", zap.Error(err), zap.ByteString("payload", payload))
|
s.logger.Error("Invalid Hyperswitch webhook payload", zap.Error(err), zap.ByteString("payload", payload))
|
||||||
return fmt.Errorf("invalid webhook payload: %w", err)
|
return fmt.Errorf("invalid webhook payload: %w", err)
|
||||||
}
|
}
|
||||||
paymentID := wp.getPaymentID()
|
paymentID := wp.getPaymentID()
|
||||||
if paymentID == "" {
|
if paymentID == "" {
|
||||||
|
span.SetStatus(codes.Error, "missing payment_id")
|
||||||
return fmt.Errorf("webhook payload missing payment_id")
|
return fmt.Errorf("webhook payload missing payment_id")
|
||||||
}
|
}
|
||||||
status := wp.getStatus()
|
status := wp.getStatus()
|
||||||
|
span.SetAttributes(
|
||||||
|
attribute.String("payment.id", paymentID),
|
||||||
|
attribute.String("payment.status", status),
|
||||||
|
attribute.String("payment.event_type", wp.EventType),
|
||||||
|
)
|
||||||
|
|
||||||
// v1.0.9 item G Phase 2: subscription dispatcher. Try the subscription
|
// v1.0.9 item G Phase 2: subscription dispatcher. Try the subscription
|
||||||
// flow first; if the payment_id maps to a subscription invoice, the
|
// flow first; if the payment_id maps to a subscription invoice, the
|
||||||
|
|
|
||||||
|
|
@ -17,8 +17,13 @@ import (
|
||||||
"veza-backend-api/internal/handlers"
|
"veza-backend-api/internal/handlers"
|
||||||
"veza-backend-api/internal/models"
|
"veza-backend-api/internal/models"
|
||||||
"veza-backend-api/internal/response"
|
"veza-backend-api/internal/response"
|
||||||
|
"veza-backend-api/internal/tracing"
|
||||||
|
|
||||||
"github.com/gin-gonic/gin"
|
"github.com/gin-gonic/gin"
|
||||||
|
"go.opentelemetry.io/otel"
|
||||||
|
"go.opentelemetry.io/otel/attribute"
|
||||||
|
"go.opentelemetry.io/otel/codes"
|
||||||
|
"go.opentelemetry.io/otel/trace"
|
||||||
"go.uber.org/zap"
|
"go.uber.org/zap"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -247,14 +252,30 @@ func (h *TrackHandler) InitiateChunkedUpload(c *gin.Context) {
|
||||||
return // Erreur déjà envoyée au client
|
return // Erreur déjà envoyée au client
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// v1.0.9 Day 9 — track.upload.initiate span. Hot path on upload kickoff.
|
||||||
|
// chunkService doesn't accept ctx today; the span only wraps the handler
|
||||||
|
// invocation, not the S3 multipart create itself. Migrating chunkService
|
||||||
|
// to take ctx is tracked separately.
|
||||||
|
_, span := otel.Tracer(tracing.TracerName).Start(c.Request.Context(), "track.upload.initiate",
|
||||||
|
trace.WithAttributes(
|
||||||
|
attribute.String("track.upload.user_id", userID.String()),
|
||||||
|
attribute.Int("track.upload.total_chunks", req.TotalChunks),
|
||||||
|
attribute.Int64("track.upload.total_size", req.TotalSize),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
defer span.End()
|
||||||
|
|
||||||
// Initialiser l'upload
|
// Initialiser l'upload
|
||||||
// InitiateChunkedUpload retourne un string (uploadID) donc pas de souci d'int64
|
// InitiateChunkedUpload retourne un string (uploadID) donc pas de souci d'int64
|
||||||
// Note: InitiateChunkedUpload n'accepte pas de context (à migrer si nécessaire)
|
// Note: InitiateChunkedUpload n'accepte pas de context (à migrer si nécessaire)
|
||||||
uploadID, err := h.chunkService.InitiateChunkedUpload(userID, req.TotalChunks, req.TotalSize, req.Filename)
|
uploadID, err := h.chunkService.InitiateChunkedUpload(userID, req.TotalChunks, req.TotalSize, req.Filename)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
span.RecordError(err)
|
||||||
|
span.SetStatus(codes.Error, "chunk upload init failed")
|
||||||
response.InternalServerError(c, err.Error())
|
response.InternalServerError(c, err.Error())
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
span.SetAttributes(attribute.String("track.upload.id", uploadID))
|
||||||
|
|
||||||
response.Success(c, gin.H{
|
response.Success(c, gin.H{
|
||||||
"upload_id": uploadID,
|
"upload_id": uploadID,
|
||||||
|
|
|
||||||
|
|
@ -13,9 +13,14 @@ import (
|
||||||
|
|
||||||
// "veza-backend-api/internal/response" // Removed this import
|
// "veza-backend-api/internal/response" // Removed this import
|
||||||
"veza-backend-api/internal/services"
|
"veza-backend-api/internal/services"
|
||||||
|
"veza-backend-api/internal/tracing"
|
||||||
|
|
||||||
"github.com/gin-gonic/gin"
|
"github.com/gin-gonic/gin"
|
||||||
"github.com/google/uuid"
|
"github.com/google/uuid"
|
||||||
|
"go.opentelemetry.io/otel"
|
||||||
|
"go.opentelemetry.io/otel/attribute"
|
||||||
|
"go.opentelemetry.io/otel/codes"
|
||||||
|
"go.opentelemetry.io/otel/trace"
|
||||||
"go.uber.org/zap"
|
"go.uber.org/zap"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -64,10 +69,10 @@ func Login(authService *auth.AuthService, sessionService *services.SessionServic
|
||||||
|
|
||||||
// req.RememberMe is a bool, not *bool, so no need to check for nil or indirect
|
// req.RememberMe is a bool, not *bool, so no need to check for nil or indirect
|
||||||
rememberMe := req.RememberMe
|
rememberMe := req.RememberMe
|
||||||
|
// SECURITY(MEDIUM-011): Mask email in logs/spans to prevent PII leakage.
|
||||||
|
maskedEmail := maskEmail(req.Email)
|
||||||
|
|
||||||
if logger != nil {
|
if logger != nil {
|
||||||
// SECURITY(MEDIUM-011): Mask email in logs to prevent PII leakage.
|
|
||||||
maskedEmail := maskEmail(req.Email)
|
|
||||||
logger.Info("Login handler processing request",
|
logger.Info("Login handler processing request",
|
||||||
zap.String("email", maskedEmail),
|
zap.String("email", maskedEmail),
|
||||||
zap.Bool("remember_me", rememberMe),
|
zap.Bool("remember_me", rememberMe),
|
||||||
|
|
@ -77,8 +82,22 @@ func Login(authService *auth.AuthService, sessionService *services.SessionServic
|
||||||
// MOD-P1-004: Ajouter timeout context pour opération DB critique (login)
|
// MOD-P1-004: Ajouter timeout context pour opération DB critique (login)
|
||||||
ctx, cancel := WithTimeout(c.Request.Context(), 5*time.Second)
|
ctx, cancel := WithTimeout(c.Request.Context(), 5*time.Second)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
|
|
||||||
|
// v1.0.9 Day 9 — auth.login span. Hot path: every login request goes
|
||||||
|
// through here. Email is masked, no password attribute. Failure paths
|
||||||
|
// below set the span status to error.
|
||||||
|
ctx, span := otel.Tracer(tracing.TracerName).Start(ctx, "auth.login",
|
||||||
|
trace.WithAttributes(
|
||||||
|
attribute.String("auth.email_masked", maskedEmail),
|
||||||
|
attribute.Bool("auth.remember_me", rememberMe),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
defer span.End()
|
||||||
|
|
||||||
user, tokens, err := authService.Login(ctx, req.Email, req.Password, rememberMe)
|
user, tokens, err := authService.Login(ctx, req.Email, req.Password, rememberMe)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
span.RecordError(err)
|
||||||
|
span.SetStatus(codes.Error, "login failed")
|
||||||
// MOD-P1-002: Improved error handling
|
// MOD-P1-002: Improved error handling
|
||||||
errMsg := err.Error()
|
errMsg := err.Error()
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -6,8 +6,13 @@ import (
|
||||||
|
|
||||||
apperrors "veza-backend-api/internal/errors"
|
apperrors "veza-backend-api/internal/errors"
|
||||||
"veza-backend-api/internal/services"
|
"veza-backend-api/internal/services"
|
||||||
|
"veza-backend-api/internal/tracing"
|
||||||
|
|
||||||
"github.com/gin-gonic/gin"
|
"github.com/gin-gonic/gin"
|
||||||
|
"go.opentelemetry.io/otel"
|
||||||
|
"go.opentelemetry.io/otel/attribute"
|
||||||
|
"go.opentelemetry.io/otel/codes"
|
||||||
|
"go.opentelemetry.io/otel/trace"
|
||||||
)
|
)
|
||||||
|
|
||||||
var SearchHandlersInstance *SearchHandlers
|
var SearchHandlersInstance *SearchHandlers
|
||||||
|
|
@ -59,8 +64,21 @@ func (sh *SearchHandlers) Search(c *gin.Context) {
|
||||||
|
|
||||||
types := c.QueryArray("type")
|
types := c.QueryArray("type")
|
||||||
|
|
||||||
|
// v1.0.9 Day 9 — search.query span. Hot path: every search bar press
|
||||||
|
// hits this. Query content is NOT recorded (PII / search history is
|
||||||
|
// sensitive); only length + types so cardinality stays bounded.
|
||||||
|
_, span := otel.Tracer(tracing.TracerName).Start(c.Request.Context(), "search.query",
|
||||||
|
trace.WithAttributes(
|
||||||
|
attribute.Int("search.query_length", len(query)),
|
||||||
|
attribute.StringSlice("search.types", types),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
defer span.End()
|
||||||
|
|
||||||
results, err := sh.searchService.Search(query, types)
|
results, err := sh.searchService.Search(query, types)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
span.RecordError(err)
|
||||||
|
span.SetStatus(codes.Error, "search failed")
|
||||||
RespondWithAppError(c, apperrors.NewInternalErrorWrap("Search failed", err))
|
RespondWithAppError(c, apperrors.NewInternalErrorWrap("Search failed", err))
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
|
||||||
194
veza-backend-api/internal/tracing/otlp_exporter.go
Normal file
194
veza-backend-api/internal/tracing/otlp_exporter.go
Normal file
|
|
@ -0,0 +1,194 @@
|
||||||
|
// Package tracing exposes the OpenTelemetry tracer provider wiring
|
||||||
|
// for veza-backend-api. v1.0.9 Day 9 — replaces the in-house
|
||||||
|
// W3C-only TraceContext (still kept for header propagation) with a
|
||||||
|
// real OTel SDK + OTLP/gRPC exporter that ships spans to the
|
||||||
|
// otel-collector container in front of Tempo.
|
||||||
|
//
|
||||||
|
// Wiring at runtime:
|
||||||
|
//
|
||||||
|
// veza-backend-api ──OTLP/gRPC:4317──▶ otel-collector ──▶ Tempo (Grafana stack)
|
||||||
|
//
|
||||||
|
// The collector + Tempo are provisioned by infra/ansible/roles/
|
||||||
|
// otel_collector + roles/tempo (W2 Day 9).
|
||||||
|
package tracing
|
||||||
|
|
||||||
|
import (
	"context"
	"errors"
	"fmt"
	"math"
	"os"
	"strconv"
	"strings"
	"time"

	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/exporters/otlp/otlptrace"
	"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
	"go.opentelemetry.io/otel/propagation"
	"go.opentelemetry.io/otel/sdk/resource"
	sdktrace "go.opentelemetry.io/otel/sdk/trace"
	semconv "go.opentelemetry.io/otel/semconv/v1.26.0"
	"go.uber.org/zap"
)
|
||||||
|
|
||||||
|
// TracerName is the global instrumentation library identifier used
// by every veza-backend-api span. Hot-path handlers grab a tracer via
// otel.Tracer(TracerName) — keep it stable so dashboards filter
// cleanly by `service.name = veza-backend-api` AND
// `instrumentation.library.name = veza-backend-api`. It doubles as
// the OTel resource service.name (see InitOTLPTracer).
const TracerName = "veza-backend-api"
|
||||||
|
|
||||||
|
// Provider holds the SDK tracer provider so the caller can shut it
// down on app exit (flushes the buffered span queue to OTLP, prevents
// trace loss on a graceful Ctrl-C).
//
// When OTEL_SDK_DISABLED=true, InitOTLPTracer returns a Provider with
// only logger set; Shutdown on such a value (or on nil) is a no-op.
type Provider struct {
	provider *sdktrace.TracerProvider // nil when the SDK is disabled
	exporter *otlptrace.Exporter      // kept so Shutdown can also close the OTLP client explicitly
	logger   *zap.Logger              // never nil after InitOTLPTracer (defaults to zap.NewNop)
}
|
||||||
|
|
||||||
|
// InitOTLPTracer initialises the global OTel tracer provider with an
// OTLP/gRPC exporter pointed at OTEL_EXPORTER_OTLP_ENDPOINT
// (default: localhost:4317, which the otel_collector role binds).
//
// Behaviour matrix:
//   - OTEL_EXPORTER_OTLP_ENDPOINT unset + OTEL_SDK_DISABLED unset →
//     try localhost:4317. If the dial fails (collector down), the
//     exporter buffers and retries; the app keeps running. Spans are
//     dropped after the buffer fills (default 2048 spans), but no
//     hot-path code blocks on the exporter.
//   - OTEL_SDK_DISABLED=true → returns a no-op Provider (zero spans
//     emitted). Used in unit tests and dev mode where the operator
//     doesn't want background networking.
//
// Side effects: registers the provider globally via
// otel.SetTracerProvider and installs W3C trace-context + baggage as
// the global text-map propagator.
//
// Caller MUST `defer p.Shutdown(ctx)` so the in-flight queue is
// flushed on exit. The returned Provider's `Shutdown` is safe to
// call multiple times.
func InitOTLPTracer(ctx context.Context, env, version string, logger *zap.Logger) (*Provider, error) {
	if logger == nil {
		logger = zap.NewNop()
	}

	if isOTelDisabled() {
		logger.Info("OTel tracer init skipped (OTEL_SDK_DISABLED=true)")
		return &Provider{logger: logger}, nil
	}

	endpoint := otelEndpoint()
	logger.Info("Initialising OTel tracer",
		zap.String("endpoint", endpoint),
		zap.String("service", TracerName),
		zap.String("env", env),
		zap.String("version", version))

	exporter, err := otlptracegrpc.New(ctx,
		otlptracegrpc.WithEndpoint(endpoint),
		otlptracegrpc.WithInsecure(), // collector runs on the trusted Incus bridge; mTLS is W4 territory
		otlptracegrpc.WithTimeout(5*time.Second),
	)
	if err != nil {
		return nil, fmt.Errorf("create OTLP exporter: %w", err)
	}

	// NOTE(review): resource.Merge returns an error when the two
	// resources carry different non-empty schema URLs. If the pinned
	// SDK's resource.Default() ever moves off semconv v1.26.0, this
	// branch fires and init fails — confirm against the otel-sdk
	// version in go.mod when bumping either side.
	res, err := resource.Merge(
		resource.Default(),
		resource.NewWithAttributes(
			semconv.SchemaURL,
			semconv.ServiceName(TracerName),
			semconv.ServiceVersion(version),
			semconv.DeploymentEnvironment(env),
		),
	)
	if err != nil {
		return nil, fmt.Errorf("build OTel resource: %w", err)
	}

	// BatchSpanProcessor — not the simple/sync processor.
	// BatchSpanProcessor:
	//   - buffers spans up to 2048 by default
	//   - flushes every 5s OR when full
	//   - never blocks the hot-path on collector availability
	// Sync would block every span on a network round-trip; that's a
	// regression on every endpoint we instrument. The trade is
	// possible span loss when the buffer fills — acceptable because
	// we instrument hot paths sparingly + the collector is
	// co-located on the same machine in v1.0.
	provider := sdktrace.NewTracerProvider(
		sdktrace.WithBatcher(exporter,
			sdktrace.WithBatchTimeout(5*time.Second),
			sdktrace.WithMaxExportBatchSize(512),
		),
		sdktrace.WithResource(res),
		// ParentBased: respect an upstream sampling decision when one
		// exists; only root spans roll the TraceIDRatio dice.
		sdktrace.WithSampler(sdktrace.ParentBased(sdktrace.TraceIDRatioBased(sampleRatio()))),
	)

	otel.SetTracerProvider(provider)
	otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(
		propagation.TraceContext{},
		propagation.Baggage{},
	))

	return &Provider{
		provider: provider,
		exporter: exporter,
		logger:   logger,
	}, nil
}
|
||||||
|
|
||||||
|
// Shutdown flushes pending spans to the collector. Pass a context
|
||||||
|
// with a deadline (typically 5-10s during graceful shutdown) so a
|
||||||
|
// dead collector doesn't block app exit.
|
||||||
|
func (p *Provider) Shutdown(ctx context.Context) error {
|
||||||
|
if p == nil || p.provider == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
var err error
|
||||||
|
if shutdownErr := p.provider.Shutdown(ctx); shutdownErr != nil {
|
||||||
|
err = errors.Join(err, fmt.Errorf("shutdown tracer provider: %w", shutdownErr))
|
||||||
|
}
|
||||||
|
if p.exporter != nil {
|
||||||
|
if shutdownErr := p.exporter.Shutdown(ctx); shutdownErr != nil {
|
||||||
|
err = errors.Join(err, fmt.Errorf("shutdown OTLP exporter: %w", shutdownErr))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
p.logger.Warn("OTel shutdown produced errors", zap.Error(err))
|
||||||
|
}
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// otelEndpoint resolves the collector target for the OTLP/gRPC
// exporter. Reads OTEL_EXPORTER_OTLP_ENDPOINT and falls back to
// localhost:4317 (the port the otel_collector role binds).
//
// The OTel spec says the env var SHOULD be a full URL
// ("http://host:4317") while otlptracegrpc.WithEndpoint wants a bare
// host:port — strip a leading scheme so both spellings work instead
// of silently failing to dial when an operator sets the
// spec-conformant form. (TLS is governed by WithInsecure in
// InitOTLPTracer, not by the scheme.)
func otelEndpoint() string {
	v := os.Getenv("OTEL_EXPORTER_OTLP_ENDPOINT")
	if v == "" {
		return "localhost:4317"
	}
	v = strings.TrimPrefix(v, "https://")
	v = strings.TrimPrefix(v, "http://")
	return v
}
|
||||||
|
|
||||||
|
// isOTelDisabled reports whether the operator explicitly switched the
// OTel SDK off via OTEL_SDK_DISABLED. Only the exact values "true"
// and "1" disable it; anything else (including unset) keeps tracing
// on.
func isOTelDisabled() bool {
	switch os.Getenv("OTEL_SDK_DISABLED") {
	case "true", "1":
		return true
	default:
		return false
	}
}
|
||||||
|
|
||||||
|
// sampleRatio reads OTEL_TRACES_SAMPLER_ARG (a float in [0, 1]) and
// returns the fraction of root traces that should ship to the
// collector. Prod runs 0.1 (10%) to keep the Tempo backend lean;
// operators can flip the ratio via the env var without a re-deploy.
//
// Unset, unparsable, or NaN values fall back to 1.0 so dev sees every
// span without ceremony; out-of-range values are clamped into [0, 1].
// The NaN guard matters: a NaN slips past both ordered clamps
// (NaN < 0 and NaN > 1 are both false) and would otherwise be handed
// straight to sdktrace.TraceIDRatioBased.
func sampleRatio() float64 {
	v := os.Getenv("OTEL_TRACES_SAMPLER_ARG")
	if v == "" {
		return 1.0
	}
	// strconv.ParseFloat rejects trailing garbage ("0.5abc") outright,
	// unlike the previous fmt.Sscanf("%f") which accepted the prefix.
	ratio, err := strconv.ParseFloat(v, 64)
	if err != nil || math.IsNaN(ratio) {
		// Garbage in the env var must not silently disable tracing;
		// sample everything and let the collector's tail sampler trim.
		return 1.0
	}
	return math.Min(math.Max(ratio, 0), 1)
}
|
||||||
Loading…
Reference in a new issue