Some checks failed
Veza CI / Notify on failure (push) Blocked by required conditions
Security Scan / Secret Scanning (gitleaks) (push) Waiting to run
Veza CI / Backend (Go) (push) Has been cancelled
Veza CI / Rust (Stream Server) (push) Has been cancelled
Veza CI / Frontend (Web) (push) Has been cancelled
E2E Playwright / e2e (full) (push) Has been cancelled
Wires distributed tracing end-to-end. Backend exports OTLP/gRPC to a
collector, which tail-samples (errors + slow always, 10% rest) and
ships to Tempo. Grafana service-map dashboard pivots on the 4
instrumented hot paths.
- internal/tracing/otlp_exporter.go : InitOTLPTracer + Provider.Shutdown,
BatchSpanProcessor (5s/512 batch), ParentBased(TraceIDRatio) sampler,
W3C trace-context + baggage propagators. OTEL_SDK_DISABLED=true
short-circuits to a no-op. Failure to dial collector is non-fatal.
- cmd/api/main.go : init at boot, defer Shutdown(5s) on exit. appVersion
ldflag-overridable for resource attributes.
- 4 hot paths instrumented :
* handlers/auth.go::Login → "auth.login"
* core/track/track_upload_handler.go::InitiateChunkedUpload → "track.upload.initiate"
* core/marketplace/service.go::ProcessPaymentWebhook → "payment.webhook"
* handlers/search_handlers.go::Search → "search.query"
PII guarded — email masked, query content not recorded (length only).
- infra/ansible/roles/otel_collector : pin v0.116.1 contrib build,
systemd unit, tail-sampling config (errors + > 500ms always kept).
- infra/ansible/roles/tempo : pin v2.7.1 monolithic, local-disk backend
(S3 deferred to v1.1), 14d retention.
- infra/ansible/playbooks/observability.yml : provisions both Incus
containers + applies common baseline + roles in order.
- inventory/lab.yml : new groups observability, otel_collectors, tempo.
- config/grafana/dashboards/service-map.json : node graph + 4 hot-path
span tables + collector throughput/queue panels.
- docs/ENV_VARIABLES.md §30 : 4 OTEL_* env vars documented.
Acceptance criterion (Day 9) : login → span visible in Tempo UI. Lab
deployment to validate with `ansible-playbook -i inventory/lab.yml
playbooks/observability.yml` once roles/postgres_ha is up.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
101 lines
3.7 KiB
JSON
101 lines
3.7 KiB
JSON
{
|
|
"annotations": { "list": [] },
|
|
"editable": true,
|
|
"fiscalYearStartMonth": 0,
|
|
"graphTooltip": 1,
|
|
"id": null,
|
|
"links": [
|
|
{ "title": "Tempo data source", "type": "link", "url": "/explore?left=%7B%22datasource%22:%22tempo%22%7D" }
|
|
],
|
|
"liveNow": false,
|
|
"panels": [
|
|
{
|
|
"datasource": { "type": "tempo", "uid": "tempo" },
|
|
"gridPos": { "h": 14, "w": 24, "x": 0, "y": 0 },
|
|
"id": 1,
|
|
"type": "nodeGraph",
|
|
"title": "Service map (last 1h)",
|
|
"options": {},
|
|
"targets": [
|
|
{
|
|
"queryType": "serviceMap",
|
|
"refId": "A",
|
|
"datasource": { "type": "tempo", "uid": "tempo" }
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"datasource": { "type": "tempo", "uid": "tempo" },
|
|
"gridPos": { "h": 10, "w": 12, "x": 0, "y": 14 },
|
|
"id": 2,
|
|
"type": "table",
|
|
"title": "Slowest spans (auth.login + track.upload.initiate + payment.webhook + search.query)",
|
|
"options": { "showHeader": true },
|
|
"fieldConfig": { "defaults": { "custom": { "align": "auto" } }, "overrides": [] },
|
|
"targets": [
|
|
{
|
|
"query": "{name=~\"auth.login|track.upload.initiate|payment.webhook|search.query\"} | by(name) | max(duration) > 0ms",
|
|
"queryType": "traceql",
|
|
"tableType": "spans",
|
|
"refId": "A",
|
|
"datasource": { "type": "tempo", "uid": "tempo" }
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"datasource": { "type": "tempo", "uid": "tempo" },
|
|
"gridPos": { "h": 10, "w": 12, "x": 12, "y": 14 },
|
|
"id": 3,
|
|
"type": "table",
|
|
"title": "Recent errors on hot paths",
|
|
"options": { "showHeader": true },
|
|
"fieldConfig": { "defaults": { "custom": { "align": "auto" } }, "overrides": [] },
|
|
"targets": [
|
|
{
|
|
"query": "{name=~\"auth.login|track.upload.initiate|payment.webhook|search.query\" && status=error} | by(name)",
|
|
"queryType": "traceql",
|
|
"tableType": "spans",
|
|
"refId": "A",
|
|
"datasource": { "type": "tempo", "uid": "tempo" }
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 },
|
|
"id": 4,
|
|
"type": "timeseries",
|
|
"title": "OTel collector — accepted vs refused spans",
|
|
"fieldConfig": { "defaults": { "unit": "short", "color": { "mode": "palette-classic" } }, "overrides": [] },
|
|
"options": { "legend": { "displayMode": "list", "placement": "bottom" } },
|
|
"targets": [
|
|
{ "expr": "sum(rate({__name__=~\"otelcol_receiver_accepted_spans(_total)?\", receiver=\"otlp\"}[5m]))", "legendFormat": "accepted", "refId": "A" },
|
|
{ "expr": "sum(rate({__name__=~\"otelcol_receiver_refused_spans(_total)?\", receiver=\"otlp\"}[5m]))", "legendFormat": "refused", "refId": "B" }
|
|
]
|
|
},
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 },
|
|
"id": 5,
|
|
"type": "timeseries",
|
|
"title": "OTel collector — exporter queue depth",
|
|
"fieldConfig": { "defaults": { "unit": "short", "color": { "mode": "palette-classic" } }, "overrides": [] },
|
|
"options": { "legend": { "displayMode": "list", "placement": "bottom" } },
|
|
"targets": [
|
|
{ "expr": "otelcol_exporter_queue_size", "legendFormat": "{{exporter}}", "refId": "A" },
|
|
{ "expr": "otelcol_exporter_queue_capacity", "legendFormat": "{{exporter}} capacity", "refId": "B" }
|
|
]
|
|
}
|
|
],
|
|
"refresh": "30s",
|
|
"schemaVersion": 38,
|
|
"style": "dark",
|
|
"tags": ["veza", "tracing", "tempo"],
|
|
"templating": { "list": [] },
|
|
"time": { "from": "now-1h", "to": "now" },
|
|
"timepicker": {},
|
|
"timezone": "browser",
|
|
"title": "Veza Service Map (Tempo)",
|
|
"uid": "veza-service-map",
|
|
"version": 1
|
|
}
|