Two complementary signals: pool-side (do we have enough connections
for the load?) and per-request (does any single handler quietly
run hundreds of queries?). Both feed Prometheus, Grafana, and the
alert rules.
Pool stats exporter (internal/database/pool_stats_exporter.go) :
- Background goroutine ticks every 15s and feeds the existing
veza_db_connections{state} gauges. Before this, the gauges only
refreshed when /health/deep was hit, so PoolExhaustionImminent
evaluated against stale data.
- Wired into cmd/api/main.go alongside the ledger sampler with a
shutdown hook for clean cancellation.
N+1 detector (internal/database/n1_detector.go +
internal/middleware/n1_query_counter.go) :
- Per-request *int64 counter attached to ctx by the gin
middleware ; GORM after-callback (Query/Create/Update/Delete/
Row/Raw) atomic-adds.
- Cost : one pointer load + one atomic add per query.
- Cardinality bounded by c.FullPath() (templated route, not URL).
- Threshold default 50, override via VEZA_N1_THRESHOLD.
- Histogram veza_db_request_query_count + counter
veza_db_n1_suspicions_total.
Alerts in alert_rules.yml veza_db_pool_n1 group :
- PoolExhaustionImminent (in_use ≥ 90% for 5m)
- PoolStatsExporterStuck (gauges frozen for 10m despite traffic)
- N1QuerySpike (> 3% of requests over threshold for 15m)
- SlowQuerySustained (slow query rate > 2/min for 15m on same op+table)
Tests : 8 detector tests + 4 middleware tests, all pass.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
95 lines
2.7 KiB
Go
95 lines
2.7 KiB
Go
package middleware
|
|
|
|
import (
|
|
"net/http"
|
|
"net/http/httptest"
|
|
"sync/atomic"
|
|
"testing"
|
|
|
|
"veza-backend-api/internal/database"
|
|
|
|
"github.com/gin-gonic/gin"
|
|
"github.com/stretchr/testify/assert"
|
|
"go.uber.org/zap"
|
|
)
|
|
|
|
// The middleware contract:
//  1. Disabled → pass-through; ctx is unchanged.
//  2. Enabled → the handler-side ctx carries a fresh per-request counter.
//  3. After the handler returns, the counter is reported regardless of
//     threshold (the histogram always records; only the log fires
//     conditionally).
|
|
|
|
func TestN1QueryCounter_DisabledPassthrough(t *testing.T) {
|
|
gin.SetMode(gin.TestMode)
|
|
r := gin.New()
|
|
r.Use(N1QueryCounter(N1QueryCounterConfig{Enabled: false}))
|
|
var sawCounter bool
|
|
r.GET("/x", func(c *gin.Context) {
|
|
sawCounter = database.CounterFromContext(c.Request.Context()) != nil
|
|
c.String(http.StatusOK, "ok")
|
|
})
|
|
|
|
req := httptest.NewRequest(http.MethodGet, "/x", nil)
|
|
w := httptest.NewRecorder()
|
|
r.ServeHTTP(w, req)
|
|
|
|
assert.Equal(t, http.StatusOK, w.Code)
|
|
assert.False(t, sawCounter, "disabled middleware must NOT attach a counter")
|
|
}
|
|
|
|
func TestN1QueryCounter_EnabledAttachesCounter(t *testing.T) {
|
|
gin.SetMode(gin.TestMode)
|
|
r := gin.New()
|
|
r.Use(N1QueryCounter(N1QueryCounterConfig{
|
|
Logger: zap.NewNop(),
|
|
Threshold: 50,
|
|
Enabled: true,
|
|
}))
|
|
var observedCount int64 = -1
|
|
r.GET("/x", func(c *gin.Context) {
|
|
// Simulate a GORM callback firing 3 times during the handler.
|
|
c2 := database.CounterFromContext(c.Request.Context())
|
|
if c2 != nil {
|
|
atomic.AddInt64(c2, 3)
|
|
observedCount = atomic.LoadInt64(c2)
|
|
}
|
|
c.String(http.StatusOK, "ok")
|
|
})
|
|
|
|
req := httptest.NewRequest(http.MethodGet, "/x", nil)
|
|
w := httptest.NewRecorder()
|
|
r.ServeHTTP(w, req)
|
|
|
|
assert.Equal(t, http.StatusOK, w.Code)
|
|
assert.Equal(t, int64(3), observedCount)
|
|
}
|
|
|
|
func TestN1QueryCounter_ZeroThresholdDefaults(t *testing.T) {
|
|
// Threshold=0 must apply the 50-default rather than treating
|
|
// it as "warn on every query". The middleware constructor's
|
|
// fallback path is the contract.
|
|
gin.SetMode(gin.TestMode)
|
|
r := gin.New()
|
|
r.Use(N1QueryCounter(N1QueryCounterConfig{
|
|
Logger: zap.NewNop(),
|
|
Threshold: 0,
|
|
Enabled: true,
|
|
}))
|
|
r.GET("/x", func(c *gin.Context) {
|
|
c.String(http.StatusOK, "ok")
|
|
})
|
|
|
|
req := httptest.NewRequest(http.MethodGet, "/x", nil)
|
|
w := httptest.NewRecorder()
|
|
r.ServeHTTP(w, req)
|
|
assert.Equal(t, http.StatusOK, w.Code)
|
|
}
|
|
|
|
func TestN1ThresholdFromEnv(t *testing.T) {
|
|
assert.Equal(t, int64(50), N1ThresholdFromEnv("", 50))
|
|
assert.Equal(t, int64(50), N1ThresholdFromEnv("not-a-number", 50))
|
|
assert.Equal(t, int64(50), N1ThresholdFromEnv("-1", 50))
|
|
assert.Equal(t, int64(123), N1ThresholdFromEnv("123", 50))
|
|
assert.Equal(t, int64(0), N1ThresholdFromEnv("0", 50))
|
|
}
|