package database // N+1 query detector — v1.0.10 ops item 11. // // N+1 is the classic GORM trap : load N parent rows, then loop and // load child rows individually instead of using Preload / a single // JOIN. The handler runs fine in dev (small table) and quietly // destroys prod once N is in the thousands. // // This file plugs into GORM's callback API to count queries per // request and compare to a threshold. The counter lives in the // request context (a simple int pointer) so it doesn't leak across // goroutines or requests. When a request finishes with a count // above threshold we log a warning + bump a Prometheus counter, // which feeds an alert rule and a Grafana panel. // // Threshold rationale : 50 queries per request is generous — // legitimate flows (admin dashboards aggregating multiple tables) // can exceed it, but the canonical "fetch list, loop, fetch each" // N+1 pattern with N ≥ 50 is rare-but-real. Tunable via // VEZA_N1_THRESHOLD ; set to 0 to disable detection without // removing the callback (which still feeds RecordDBQuery). import ( "context" "sync/atomic" "veza-backend-api/internal/metrics" "go.uber.org/zap" "gorm.io/gorm" ) // queryCounterKeyType is a private type so context-key collisions // with other packages are impossible (Go's context value lookup // uses the key's identity, not its string). type queryCounterKeyType struct{} // QueryCounterKey is the context key used to thread the per-request // query counter through GORM's callback. Exported because the // middleware (in internal/middleware) needs to put a fresh counter // in each request's ctx. var QueryCounterKey = queryCounterKeyType{} // AttachCounter returns a new context with a fresh int64 query // counter. Call from the gin middleware on every request ; the // returned ctx must be propagated into all DB calls (via // db.WithContext(ctx)) for the counter to fire. func AttachCounter(ctx context.Context) (context.Context, *int64) { var counter int64 return context.WithValue(ctx, QueryCounterKey, &counter), &counter } // CounterFromContext returns the *int64 query counter, or nil if // the request did not pass through AttachCounter (background // jobs, WebSocket handlers, etc.). Returning nil instead of // allocating-on-miss is intentional : we don't want background // flows quietly accumulating into a counter no one reads. func CounterFromContext(ctx context.Context) *int64 { v := ctx.Value(QueryCounterKey) if v == nil { return nil } if c, ok := v.(*int64); ok { return c } return nil } // N1DetectorConfig configures the GORM-callback side of the // detector. Threshold = 0 disables the warn-log path but keeps // counting (so the metric is always available for ad-hoc // investigation). type N1DetectorConfig struct { Logger *zap.Logger Threshold int64 // queries per request that triggers a warn log Enabled bool } // RegisterN1Callbacks attaches the GORM callback that increments // the per-request counter. It mirrors PerformanceMonitor's // after-callback hook layout so both can coexist. // // The callback itself does almost nothing : grab the counter from // the statement context, atomic.AddInt64. Cost is one pointer // load + one atomic add per query. At 1k QPS that's ~5µs total // CPU, negligible. func RegisterN1Callbacks(db *gorm.DB) { if db == nil { return } bump := func(d *gorm.DB) { if d.Statement == nil || d.Statement.Context == nil { return } if c := CounterFromContext(d.Statement.Context); c != nil { atomic.AddInt64(c, 1) } } // Hook every query type ; SELECT-heavy patterns are the most // common cause but UPDATE-in-loop is also a real pattern. _ = db.Callback().Query().After("gorm:query").Register("n1_detector:query", bump) _ = db.Callback().Create().After("gorm:create").Register("n1_detector:create", bump) _ = db.Callback().Update().After("gorm:update").Register("n1_detector:update", bump) _ = db.Callback().Delete().After("gorm:delete").Register("n1_detector:delete", bump) _ = db.Callback().Row().After("gorm:row").Register("n1_detector:row", bump) _ = db.Callback().Raw().After("gorm:raw").Register("n1_detector:raw", bump) } // ReportRequestQueryCount is the second half : called from the gin // middleware after the handler finishes. It reads the counter, emits // the histogram, logs a warning if over threshold, and bumps the // "n+1 suspicious" counter that drives the alert rule. Idempotent — // safe to call from a deferred middleware that fires on panic too. func ReportRequestQueryCount(cfg N1DetectorConfig, route string, count int64) { if !cfg.Enabled { return } metrics.RecordRequestQueryCount(route, count) if cfg.Threshold > 0 && count > cfg.Threshold && cfg.Logger != nil { cfg.Logger.Warn("possible N+1 query pattern", zap.String("route", route), zap.Int64("query_count", count), zap.Int64("threshold", cfg.Threshold)) metrics.RecordN1Suspicion(route) } }