veza/veza-backend-api/internal/database/pool_stats_exporter.go
senke ccf3e64d9a feat(observability): DB pool monitoring + N+1 detection (v1.0.10 ops item 11)
Two complementary signals: pool-side (do we have enough connections
for the load?) and per-request side (does any single handler quietly
run hundreds of queries?). Both feed Prometheus + Grafana + alert
rules.

Pool stats exporter (internal/database/pool_stats_exporter.go):
- Background goroutine ticks every 15s and feeds the existing
  veza_db_connections{state} gauges. Before this, the gauges only
  refreshed when /health/deep was hit, so PoolExhaustionImminent
  evaluated against stale data.
- Wired into cmd/api/main.go alongside the ledger sampler with a
  shutdown hook for clean cancellation.

N+1 detector (internal/database/n1_detector.go +
internal/middleware/n1_query_counter.go):
- Per-request *int64 counter attached to ctx by the gin
  middleware; GORM after-callback (Query/Create/Update/Delete/
  Row/Raw) atomic-adds.
- Cost: one pointer load + one atomic add per query.
- Cardinality bounded by c.FullPath() (templated route, not URL).
- Threshold default 50, override via VEZA_N1_THRESHOLD.
- Histogram veza_db_request_query_count + counter
  veza_db_n1_suspicions_total.

Alerts in alert_rules.yml veza_db_pool_n1 group:
- PoolExhaustionImminent (in_use ≥ 90% for 5m)
- PoolStatsExporterStuck (gauges frozen for 10m despite traffic)
- N1QuerySpike (> 3% of requests over threshold for 15m)
- SlowQuerySustained (slow query rate > 2/min for 15m on same op+table)
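
A sketch of what the first rule might look like in alert_rules.yml. The expression and label names here are assumptions inferred from the gauge described above, not the actual rule:

```yaml
groups:
  - name: veza_db_pool_n1
    rules:
      - alert: PoolExhaustionImminent
        # Assumed metric shape: in_use and max_open exposed as states
        # of the veza_db_connections gauge.
        expr: |
          veza_db_connections{state="in_use"}
            / ignoring(state) veza_db_connections{state="max_open"} >= 0.9
        for: 5m
        labels:
          severity: warning
```

The 5m `for:` clause is why the exporter's 15s tick matters: the alert only fires if every evaluation in that window sees fresh, saturated numbers.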

Tests: 8 detector tests + 4 middleware tests, all pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-04 23:53:37 +02:00

package database

// Periodic DB pool stats exporter — v1.0.10 ops item 11.
//
// Before this file the pool gauges (veza_db_connections{state}) were
// only updated when the /health/deep endpoint was hit. That meant
// Prometheus scrapes between health checks saw stale numbers. The
// existing gauges + UpdateDBConnections() were already there; this
// just feeds them on a schedule so PoolExhaustionImminent and the
// "in_use ≥ MaxOpen × 0.9" alert have fresh data to evaluate.
//
// Why a goroutine and not a Prometheus collector callback: sql.DB
// doesn't expose a cheap polling hook, and the callback model would
// also force us to hold the DB handle in the metrics package, which
// is the wrong layering.

import (
	"context"
	"time"

	"go.uber.org/zap"
	"gorm.io/gorm"
)

// StartPoolStatsExporter launches a background goroutine that calls
// GetPoolStats(db) on the given interval. The goroutine exits cleanly
// when ctx is cancelled (typically on server shutdown). An interval ≤ 0
// falls back to 15s — short enough that pool exhaustion shows up
// before the alert's 5m `for:` window, long enough that the overhead
// is negligible (a single sql.DB.Stats() call costs nanoseconds).
//
// Returns immediately; the caller is responsible for cancelling
// ctx on shutdown.
func StartPoolStatsExporter(ctx context.Context, db *gorm.DB, interval time.Duration, logger *zap.Logger) {
	if db == nil {
		if logger != nil {
			logger.Warn("pool stats exporter not started: db is nil")
		}
		return
	}
	if interval <= 0 {
		interval = 15 * time.Second
	}
	go func() {
		ticker := time.NewTicker(interval)
		defer ticker.Stop()
		// Emit once immediately so the first scrape after startup
		// has a non-stale value (otherwise the gauge sits at 0
		// for up to `interval` seconds and the alert's
		// availability-vs-saturation calc reads wrong).
		if _, err := GetPoolStats(db); err != nil && logger != nil {
			logger.Debug("pool stats initial sample failed", zap.Error(err))
		}
		for {
			select {
			case <-ctx.Done():
				if logger != nil {
					logger.Debug("pool stats exporter stopped")
				}
				return
			case <-ticker.C:
				if _, err := GetPoolStats(db); err != nil && logger != nil {
					logger.Debug("pool stats sample failed", zap.Error(err))
				}
			}
		}
	}()
	if logger != nil {
		logger.Info("pool stats exporter started", zap.Duration("interval", interval))
	}
}