veza/veza-backend-api/internal/database/n1_detector.go
senke ccf3e64d9a feat(observability): DB pool monitoring + N+1 detection (v1.0.10 ops item 11)
Two complementary signals: pool-side (do we have enough connections
for the load?) and per-request side (does any single handler quietly
run hundreds of queries?). Both feed Prometheus + Grafana + alert
rules.

Pool stats exporter (internal/database/pool_stats_exporter.go):
- Background goroutine ticks every 15s and feeds the existing
  veza_db_connections{state} gauges (sketch below). Before this, the
  gauges were only refreshed when /health/deep was hit, so
  PoolExhaustionImminent evaluated against stale data.
- Wired into cmd/api/main.go alongside the ledger sampler with a
  shutdown hook for clean cancellation.
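
A minimal sketch of what the exporter loop might look like (the 15s
cadence, the veza_db_connections{state} gauges and the shutdown-hook
cancellation are from this commit; StartPoolStatsExporter,
metrics.SetDBConnections and the exact state labels are illustrative
assumptions, not the committed implementation):

	// Hypothetical sketch, not the committed pool_stats_exporter.go.
	package database

	import (
		"context"
		"database/sql"
		"time"

		"veza-backend-api/internal/metrics"
	)

	// StartPoolStatsExporter refreshes the pool gauges every 15s until
	// ctx is cancelled (the shutdown hook in cmd/api/main.go cancels it).
	func StartPoolStatsExporter(ctx context.Context, sqlDB *sql.DB) {
		go func() {
			ticker := time.NewTicker(15 * time.Second)
			defer ticker.Stop()
			for {
				select {
				case <-ctx.Done():
					return
				case <-ticker.C:
					s := sqlDB.Stats()
					metrics.SetDBConnections("open", float64(s.OpenConnections))
					metrics.SetDBConnections("in_use", float64(s.InUse))
					metrics.SetDBConnections("idle", float64(s.Idle))
				}
			}
		}()
	}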

N+1 detector (internal/database/n1_detector.go +
internal/middleware/n1_query_counter.go):
- Per-request *int64 counter attached to ctx by the gin middleware
  (see the sketch after this list); the GORM after-callback
  (Query/Create/Update/Delete/Row/Raw) atomic-adds to it.
- Cost: one pointer load + one atomic add per query.
- Cardinality bounded by c.FullPath() (templated route, not raw URL).
- Threshold default 50, override via VEZA_N1_THRESHOLD.
- Histogram veza_db_request_query_count + counter
  veza_db_n1_suspicions_total.
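
A minimal sketch of the middleware half, built only from the exported
API in n1_detector.go below (AttachCounter, ReportRequestQueryCount,
N1DetectorConfig) plus gin; the env parsing and the exact handler
shape are assumptions, so the real n1_query_counter.go may differ:

	// Hypothetical sketch, not the committed n1_query_counter.go.
	package middleware

	import (
		"os"
		"strconv"
		"sync/atomic"

		"github.com/gin-gonic/gin"
		"go.uber.org/zap"

		"veza-backend-api/internal/database"
	)

	func N1QueryCounter(logger *zap.Logger) gin.HandlerFunc {
		threshold := int64(50) // default from this commit
		if v, err := strconv.ParseInt(os.Getenv("VEZA_N1_THRESHOLD"), 10, 64); err == nil {
			threshold = v
		}
		cfg := database.N1DetectorConfig{Logger: logger, Threshold: threshold, Enabled: true}
		return func(c *gin.Context) {
			// Fresh counter per request; handlers must propagate this ctx
			// into GORM via db.WithContext for the callback to see it.
			ctx, counter := database.AttachCounter(c.Request.Context())
			c.Request = c.Request.WithContext(ctx)
			defer func() {
				// Runs even if the handler panics; c.FullPath() keeps the
				// metric cardinality bounded to the templated route.
				database.ReportRequestQueryCount(cfg, c.FullPath(), atomic.LoadInt64(counter))
			}()
			c.Next()
		}
	}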

Alerts in alert_rules.yml, veza_db_pool_n1 group:
- PoolExhaustionImminent (in_use ≥ 90% for 5m)
- PoolStatsExporterStuck (gauges frozen for 10m despite traffic)
- N1QuerySpike (> 3% of requests over threshold for 15m)
- SlowQuerySustained (slow query rate > 2/min for 15m on same op+table)

Tests: 8 detector tests + 4 middleware tests, all pass.
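
For illustration, one of the detector tests could look roughly like
the sketch below; the actual test names and structure are assumptions,
only the AttachCounter / CounterFromContext contract comes from the
file that follows:

	// Hypothetical example; the committed tests may differ.
	package database_test

	import (
		"context"
		"sync/atomic"
		"testing"

		"veza-backend-api/internal/database"
	)

	func TestAttachCounterCountsPerRequest(t *testing.T) {
		ctx, counter := database.AttachCounter(context.Background())

		// Simulate what the GORM after-callback does for three queries.
		for i := 0; i < 3; i++ {
			if c := database.CounterFromContext(ctx); c != nil {
				atomic.AddInt64(c, 1)
			}
		}
		if got := atomic.LoadInt64(counter); got != 3 {
			t.Fatalf("want 3 queries counted, got %d", got)
		}

		// Contexts that never went through AttachCounter yield nil, so
		// background jobs and WebSocket handlers are not counted.
		if database.CounterFromContext(context.Background()) != nil {
			t.Fatal("want nil counter for a context without AttachCounter")
		}
	}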

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-04 23:53:37 +02:00

package database

// N+1 query detector — v1.0.10 ops item 11.
//
// N+1 is the classic GORM trap: load N parent rows, then loop and
// load child rows individually instead of using Preload / a single
// JOIN. The handler runs fine in dev (small table) and quietly
// destroys prod once N is in the thousands.
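//
// For illustration only (hypothetical Post/Comment models, not types
// from this repo), the trap and the fix look roughly like:
//
//	var posts []Post
//	db.WithContext(ctx).Find(&posts) // 1 query for the parents
//	for i := range posts {
//		// N more queries, one per parent: the N+1.
//		db.WithContext(ctx).Where("post_id = ?", posts[i].ID).Find(&posts[i].Comments)
//	}
//
//	// Fix: Preload issues 2 queries total, regardless of N.
//	db.WithContext(ctx).Preload("Comments").Find(&posts)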
//
// This file plugs into GORM's callback API to count queries per
// request and compare to a threshold. The counter lives in the
// request context (a simple int pointer) so it doesn't leak across
// goroutines or requests. When a request finishes with a count
// above threshold we log a warning + bump a Prometheus counter,
// which feeds an alert rule and a Grafana panel.
//
// Threshold rationale: 50 queries per request is generous —
// legitimate flows (admin dashboards aggregating multiple tables)
// can exceed it, but the canonical "fetch list, loop, fetch each"
// N+1 pattern with N ≥ 50 is rare-but-real. Tunable via
// VEZA_N1_THRESHOLD; set to 0 to disable detection without
// removing the callback (which still feeds RecordDBQuery).

import (
	"context"
	"sync/atomic"

	"go.uber.org/zap"
	"gorm.io/gorm"

	"veza-backend-api/internal/metrics"
)

// queryCounterKeyType is a private type so context-key collisions
// with other packages are impossible (Go's context value lookup
// uses the key's identity, not its string).
type queryCounterKeyType struct{}

// QueryCounterKey is the context key used to thread the per-request
// query counter through GORM's callback. Exported because the
// middleware (in internal/middleware) needs to put a fresh counter
// in each request's ctx.
var QueryCounterKey = queryCounterKeyType{}

// AttachCounter returns a new context with a fresh int64 query
// counter. Call from the gin middleware on every request; the
// returned ctx must be propagated into all DB calls (via
// db.WithContext(ctx)) for the counter to fire.
func AttachCounter(ctx context.Context) (context.Context, *int64) {
	var counter int64
	return context.WithValue(ctx, QueryCounterKey, &counter), &counter
}

// CounterFromContext returns the *int64 query counter, or nil if
// the request did not pass through AttachCounter (background
// jobs, WebSocket handlers, etc.). Returning nil instead of
// allocating-on-miss is intentional: we don't want background
// flows quietly accumulating into a counter no one reads.
func CounterFromContext(ctx context.Context) *int64 {
	v := ctx.Value(QueryCounterKey)
	if v == nil {
		return nil
	}
	if c, ok := v.(*int64); ok {
		return c
	}
	return nil
}

// N1DetectorConfig configures the GORM-callback side of the
// detector. Threshold = 0 disables the warn-log path but keeps
// counting (so the metric is always available for ad-hoc
// investigation).
type N1DetectorConfig struct {
	Logger    *zap.Logger
	Threshold int64 // queries per request that triggers a warn log
	Enabled   bool
}

// RegisterN1Callbacks attaches the GORM callback that increments
// the per-request counter. It mirrors PerformanceMonitor's
// after-callback hook layout so both can coexist.
//
// The callback itself does almost nothing: grab the counter from
// the statement context, atomic.AddInt64. Cost is one pointer
// load + one atomic add per query. At 1k QPS that's ~5µs total
// CPU, negligible.
func RegisterN1Callbacks(db *gorm.DB) {
	if db == nil {
		return
	}
	bump := func(d *gorm.DB) {
		if d.Statement == nil || d.Statement.Context == nil {
			return
		}
		if c := CounterFromContext(d.Statement.Context); c != nil {
			atomic.AddInt64(c, 1)
		}
	}
	// Hook every query type; SELECT-heavy patterns are the most
	// common cause but UPDATE-in-loop is also a real pattern.
	_ = db.Callback().Query().After("gorm:query").Register("n1_detector:query", bump)
	_ = db.Callback().Create().After("gorm:create").Register("n1_detector:create", bump)
	_ = db.Callback().Update().After("gorm:update").Register("n1_detector:update", bump)
	_ = db.Callback().Delete().After("gorm:delete").Register("n1_detector:delete", bump)
	_ = db.Callback().Row().After("gorm:row").Register("n1_detector:row", bump)
	_ = db.Callback().Raw().After("gorm:raw").Register("n1_detector:raw", bump)
}

// ReportRequestQueryCount is the second half: called from the gin
// middleware after the handler finishes. It reads the counter, emits
// the histogram, logs a warning if over threshold, and bumps the
// "n+1 suspicious" counter that drives the alert rule. Idempotent —
// safe to call from a deferred middleware that fires on panic too.
func ReportRequestQueryCount(cfg N1DetectorConfig, route string, count int64) {
	if !cfg.Enabled {
		return
	}
	metrics.RecordRequestQueryCount(route, count)
	if cfg.Threshold > 0 && count > cfg.Threshold && cfg.Logger != nil {
		cfg.Logger.Warn("possible N+1 query pattern",
			zap.String("route", route),
			zap.Int64("query_count", count),
			zap.Int64("threshold", cfg.Threshold))
		metrics.RecordN1Suspicion(route)
	}
}