Two complementary signals: pool-side (do we have enough connections
for the load?) and per-request (does any single handler quietly
run hundreds of queries?). Both feed Prometheus + Grafana + alert
rules.
Pool stats exporter (internal/database/pool_stats_exporter.go),
sketched after these bullets:
- Background goroutine ticks every 15s and feeds the existing
veza_db_connections{state} gauges. Before this, the gauges only
refreshed when /health/deep was hit, so PoolExhaustionImminent
evaluated against stale data.
- Wired into cmd/api/main.go alongside the ledger sampler with a
shutdown hook for clean cancellation.
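A minimal sketch of the exporter loop, assuming it wraps the *sql.DB behind
GORM and reuses UpdateDBConnections from the metrics package shown further
down; the function name, import path, and ticker constant are illustrative,
the real code lives in internal/database/pool_stats_exporter.go:

package database

import (
	"context"
	"database/sql"
	"time"

	"veza-backend-api/internal/metrics" // assumed import path for the metrics package below
)

// StartPoolStatsExporter (hypothetical name) refreshes the
// veza_db_connections{state} gauges every 15s until ctx is cancelled
// by the shutdown hook in cmd/api/main.go.
func StartPoolStatsExporter(ctx context.Context, db *sql.DB) {
	go func() {
		ticker := time.NewTicker(15 * time.Second)
		defer ticker.Stop()
		for {
			select {
			case <-ctx.Done():
				return
			case <-ticker.C:
				s := db.Stats()
				metrics.UpdateDBConnections(s.OpenConnections, s.Idle, s.InUse)
			}
		}
	}()
}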
N+1 detector (internal/database/n1_detector.go +
internal/middleware/n1_query_counter.go), middleware half sketched
after these bullets:
- Per-request *int64 counter attached to ctx by the gin middleware;
GORM after-callbacks (Query/Create/Update/Delete/Row/Raw)
atomic-add into it.
- Cost: one pointer load + one atomic add per query.
- Cardinality bounded by c.FullPath() (templated route, not URL).
- Threshold default 50, override via VEZA_N1_THRESHOLD.
- Histogram veza_db_request_query_count + counter
veza_db_n1_suspicions_total.
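A minimal sketch of the middleware half, assuming the constructor name,
context key, and env parsing shown here (the real implementation is
internal/middleware/n1_query_counter.go and may differ in detail); the
GORM after-callback half is sketched after the metrics file below:

package middleware

import (
	"context"
	"os"
	"strconv"
	"sync/atomic"

	"github.com/gin-gonic/gin"

	"veza-backend-api/internal/metrics" // assumed import path
)

// queryCountKey is the context key the GORM after-callbacks look up;
// the key and where it lives are illustrative, it just has to be shared
// between the middleware and the detector.
type queryCountKey struct{}

// N1QueryCounter (hypothetical name) attaches a per-request *int64 to
// the request context, lets the GORM callbacks atomic-add into it, then
// observes the total and flags a suspicion past the threshold.
func N1QueryCounter() gin.HandlerFunc {
	threshold := int64(50) // default threshold
	if v, err := strconv.ParseInt(os.Getenv("VEZA_N1_THRESHOLD"), 10, 64); err == nil && v > 0 {
		threshold = v
	}
	return func(c *gin.Context) {
		var n int64
		ctx := context.WithValue(c.Request.Context(), queryCountKey{}, &n)
		c.Request = c.Request.WithContext(ctx)

		c.Next()

		count := atomic.LoadInt64(&n)
		route := c.FullPath() // templated route keeps label cardinality bounded
		if route == "" {
			route = "unmatched" // fallback label; the real code may handle this differently
		}
		metrics.RecordRequestQueryCount(route, count)
		if count > threshold {
			metrics.RecordN1Suspicion(route)
		}
	}
}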
Alerts in alert_rules.yml, veza_db_pool_n1 group:
- PoolExhaustionImminent (in_use ≥ 90% for 5m)
- PoolStatsExporterStuck (gauges frozen for 10m despite traffic)
- N1QuerySpike (> 3% of requests over threshold for 15m)
- SlowQuerySustained (slow query rate > 2/min for 15m on same op+table)
Tests: 8 detector tests + 4 middleware tests, all passing.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
167 lines, 5.7 KiB, Go
package metrics

import (
	"strconv"
	"time"

	"veza-backend-api/internal/errors"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

var (
	// errorsTotal counts total errors by error code and HTTP status
	errorsTotal = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Name: "veza_errors_legacy_total",
			Help: "Total number of errors by code and HTTP status",
		},
		[]string{"error_code", "http_status"},
	)

	// errorsByCode counts errors by error code
	errorsByCode = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Name: "veza_errors_by_code_total",
			Help: "Total number of errors by error code",
		},
		[]string{"error_code"},
	)

	// errorsByHTTPStatus counts errors by HTTP status
	errorsByHTTPStatus = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Name: "veza_errors_by_http_status_total",
			Help: "Total number of errors by HTTP status code",
		},
		[]string{"http_status"},
	)

	// dbQueriesTotal counts total DB queries by operation and table
	dbQueriesTotal = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Name: "veza_db_queries_total",
			Help: "Total number of database queries",
		},
		[]string{"operation", "table"},
	)

	// dbQueryDuration measures DB query duration
	dbQueryDuration = promauto.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "veza_db_query_duration_seconds",
			Help:    "Database query duration in seconds",
			Buckets: []float64{.001, .005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5},
		},
		[]string{"operation", "table"},
	)

	// dbConnections tracks the number of DB connections by state
	dbConnections = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "veza_db_connections",
			Help: "Number of database connections",
		},
		[]string{"state"}, // open, idle, in_use
	)

	// dbSlowQueries counts slow queries by operation and table
	// BE-DB-018: Slow query logging and performance metrics
	dbSlowQueries = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Name: "veza_db_slow_queries_total",
			Help: "Total number of slow database queries",
		},
		[]string{"operation", "table"},
	)

	// dbSlowQueryDuration measures slow query duration
	dbSlowQueryDuration = promauto.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "veza_db_slow_query_duration_seconds",
			Help:    "Slow database query duration in seconds",
			Buckets: []float64{1, 2, 5, 10, 30, 60, 120}, // Buckets for slow queries (seconds)
		},
		[]string{"operation", "table"},
	)

	// dbRequestQueryCount is the distribution of how many DB queries each
	// HTTP request triggers. p99 is what catches N+1 patterns: a healthy
	// endpoint runs 1-10 queries; a regressed one runs hundreds.
	// v1.0.10 ops item 11.
	dbRequestQueryCount = promauto.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "veza_db_request_query_count",
			Help:    "Number of database queries executed during a single HTTP request, by route.",
			Buckets: []float64{1, 5, 10, 20, 50, 100, 200, 500, 1000},
		},
		[]string{"route"},
	)

	// dbN1SuspicionsTotal counts requests that crossed the per-request
	// query-count threshold. v1.0.10 ops item 11. The counter labels by
	// route so the alert can pinpoint which handler regressed (a broad
	// spike means an upstream bug; a single-route spike means that handler).
	dbN1SuspicionsTotal = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Name: "veza_db_n1_suspicions_total",
			Help: "Requests where DB query count exceeded the N+1 detection threshold.",
		},
		[]string{"route"},
	)
)

// RecordErrorPrometheus records an error in Prometheus
func RecordErrorPrometheus(code errors.ErrorCode, httpStatus int) {
	codeStr := strconv.Itoa(int(code))
	statusStr := strconv.Itoa(httpStatus)

	errorsTotal.WithLabelValues(codeStr, statusStr).Inc()
	errorsByCode.WithLabelValues(codeStr).Inc()
	errorsByHTTPStatus.WithLabelValues(statusStr).Inc()
}

// RecordDBQuery records a DB query in Prometheus
// operation: operation type (SELECT, INSERT, UPDATE, DELETE, etc.)
// table: table name (or "unknown" if unavailable)
// duration: query duration
func RecordDBQuery(operation, table string, duration time.Duration) {
	dbQueriesTotal.WithLabelValues(operation, table).Inc()
	dbQueryDuration.WithLabelValues(operation, table).Observe(duration.Seconds())
}

// UpdateDBConnections updates the DB connection metrics
// open: total number of open connections
// idle: number of idle connections
// inUse: number of connections currently in use
func UpdateDBConnections(open, idle, inUse int) {
	dbConnections.WithLabelValues("open").Set(float64(open))
	dbConnections.WithLabelValues("idle").Set(float64(idle))
	dbConnections.WithLabelValues("in_use").Set(float64(inUse))
}

// RecordSlowQuery records a slow query in Prometheus
// BE-DB-018: Slow query logging and performance metrics
// operation: operation type (SELECT, INSERT, UPDATE, DELETE, etc.)
// table: table name (or "unknown" if unavailable)
// duration: query duration
func RecordSlowQuery(operation, table string, duration time.Duration) {
	dbSlowQueries.WithLabelValues(operation, table).Inc()
	dbSlowQueryDuration.WithLabelValues(operation, table).Observe(duration.Seconds())
}

// RecordRequestQueryCount records the number of DB queries a single
// request executed. Called from the gin middleware after the handler
// returns. v1.0.10 ops item 11.
func RecordRequestQueryCount(route string, count int64) {
	dbRequestQueryCount.WithLabelValues(route).Observe(float64(count))
}

// RecordN1Suspicion bumps the per-route suspicion counter when a
// request's query count crosses the configured threshold. v1.0.10
// ops item 11.
func RecordN1Suspicion(route string) {
	dbN1SuspicionsTotal.WithLabelValues(route).Inc()
}
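And a minimal sketch of the detector half in internal/database/n1_detector.go,
assuming GORM v2 callback registration; the function and processor names are
illustrative, and the queryCountKey type is a stand-in for the shared context
key the middleware sketch above stores the per-request counter under (in the
real code the key is defined once and used by both sides):

package database

import (
	"sync/atomic"

	"gorm.io/gorm"
)

// queryCountKey stands in for the shared context key under which the
// middleware stores the per-request *int64.
type queryCountKey struct{}

// registerN1Callbacks (hypothetical name) hooks an "after" step onto
// every GORM operation. Each executed query costs one pointer load plus
// one atomic add when a per-request counter is present, and a single
// failed type assertion otherwise.
func registerN1Callbacks(db *gorm.DB) {
	bump := func(tx *gorm.DB) {
		if p, ok := tx.Statement.Context.Value(queryCountKey{}).(*int64); ok {
			atomic.AddInt64(p, 1)
		}
	}
	db.Callback().Query().After("gorm:query").Register("veza:n1_count_query", bump)
	db.Callback().Create().After("gorm:create").Register("veza:n1_count_create", bump)
	db.Callback().Update().After("gorm:update").Register("veza:n1_count_update", bump)
	db.Callback().Delete().After("gorm:delete").Register("veza:n1_count_delete", bump)
	db.Callback().Row().After("gorm:row").Register("veza:n1_count_row", bump)
	db.Callback().Raw().After("gorm:raw").Register("veza:n1_count_raw", bump)
}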