diff --git a/veza-backend-api/internal/api/router.go b/veza-backend-api/internal/api/router.go index 316742647..d3b670ff8 100644 --- a/veza-backend-api/internal/api/router.go +++ b/veza-backend-api/internal/api/router.go @@ -196,7 +196,12 @@ func (r *APIRouter) Setup(router *gin.Engine) error { // Middlewares globaux (after CORS) router.Use(middleware.CacheHeaders(middleware.DefaultCacheHeadersConfig())) // v0.12.4: CDN cache headers - router.Use(middleware.MaintenanceGin()) // v0.803 ADM1-03: Maintenance mode (503 except /health, /admin) + // v1.0.4: Back the maintenance flag with platform_settings.maintenance_mode + // so flipping it on one pod propagates to every other pod within ~10s. + if r.db != nil && r.db.GormDB != nil { + middleware.InitMaintenanceMode(r.db.GormDB, r.logger) + } + router.Use(middleware.MaintenanceGin()) // v0.803 ADM1-03: Maintenance mode (503 except /health, /admin) router.Use(middleware.RequestLogger(r.logger)) // Utilisation du structured logger router.Use(middleware.Metrics()) // Prometheus Metrics router.Use(middleware.SentryRecover(r.logger)) // Sentry error tracking diff --git a/veza-backend-api/internal/api/routes_core.go b/veza-backend-api/internal/api/routes_core.go index 490365720..20b65a543 100644 --- a/veza-backend-api/internal/api/routes_core.go +++ b/veza-backend-api/internal/api/routes_core.go @@ -419,7 +419,8 @@ func (r *APIRouter) setupCoreProtectedRoutes(v1 *gin.RouterGroup) { admin.GET("/reports", reportHandler.ListReports) admin.POST("/reports/:id/resolve", reportHandler.ResolveReport) - // v0.803 ADM1-03: Maintenance mode toggle + // v0.803 ADM1-03: Maintenance mode toggle — v1.0.4: persisted via + // platform_settings so a toggle on one pod affects every other pod. admin.PUT("/maintenance", func(c *gin.Context) { var req struct { Enabled bool `json:"enabled"` @@ -428,6 +429,21 @@ func (r *APIRouter) setupCoreProtectedRoutes(v1 *gin.RouterGroup) { c.JSON(http.StatusBadRequest, gin.H{"error": "enabled is required"}) return } + if r.db != nil && r.db.GormDB != nil { + if err := r.db.GormDB.WithContext(c.Request.Context()).Exec( + `INSERT INTO platform_settings (key, value_bool, description) + VALUES ('maintenance_mode', ?, 'When TRUE, all API requests outside the exempt list return 503.') + ON CONFLICT (key) DO UPDATE SET value_bool = EXCLUDED.value_bool, updated_at = NOW()`, + req.Enabled, + ).Error; err != nil { + r.logger.Error("Failed to persist maintenance flag", + zap.Bool("enabled", req.Enabled), + zap.Error(err), + ) + c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to persist maintenance flag"}) + return + } + } middleware.SetMaintenanceMode(req.Enabled) c.JSON(http.StatusOK, gin.H{"maintenance_mode": req.Enabled}) }) diff --git a/veza-backend-api/internal/middleware/maintenance.go b/veza-backend-api/internal/middleware/maintenance.go index a1f34af74..7c57e5bc8 100644 --- a/veza-backend-api/internal/middleware/maintenance.go +++ b/veza-backend-api/internal/middleware/maintenance.go @@ -1,39 +1,134 @@ package middleware import ( + "context" "net/http" "os" "strings" "sync" + "time" "github.com/gin-gonic/gin" + "go.uber.org/zap" + "gorm.io/gorm" ) +// maintenanceState carries the latest cached view of the platform-wide +// maintenance flag. It is refreshed lazily from `platform_settings` when a +// request comes in after the TTL has expired, so operators flipping the flag +// on one pod propagate to every other pod within a bounded window (10s). +type maintenanceState struct { + mu sync.RWMutex + enabled bool + lastCheck time.Time + db *gorm.DB + logger *zap.Logger + ttl time.Duration +} + +const defaultMaintenanceCacheTTL = 10 * time.Second + var ( - maintenanceMode bool - maintenanceModeOnce sync.Once - maintenanceMu sync.RWMutex + state = &maintenanceState{ttl: defaultMaintenanceCacheTTL} + maintenanceInitMu sync.Mutex ) func init() { - maintenanceModeOnce.Do(func() { - v := os.Getenv("MAINTENANCE_MODE") - maintenanceMode = v == "true" || v == "1" - }) + v := os.Getenv("MAINTENANCE_MODE") + state.mu.Lock() + state.enabled = v == "true" || v == "1" + state.mu.Unlock() } -// MaintenanceModeEnabled returns whether maintenance mode is active +// InitMaintenanceMode wires the DB pool so subsequent MaintenanceModeEnabled() +// calls refresh from `platform_settings.maintenance_mode` with a TTL cache. +// Safe to call more than once (last write wins). If db is nil the middleware +// falls back to the in-memory state seeded from MAINTENANCE_MODE. +func InitMaintenanceMode(db *gorm.DB, logger *zap.Logger) { + maintenanceInitMu.Lock() + defer maintenanceInitMu.Unlock() + + if logger == nil { + logger = zap.NewNop() + } + state.mu.Lock() + state.db = db + state.logger = logger + state.lastCheck = time.Time{} // force refresh on first call + state.mu.Unlock() + + // Prime the cache so the very first request doesn't see a stale value. + refreshFromDB(context.Background()) +} + +// refreshFromDB reads the current value from the DB and updates the cache. +// Never propagates errors to callers — a broken DB should not silently +// enable maintenance mode, so the previous cached value wins. +func refreshFromDB(ctx context.Context) { + state.mu.RLock() + db := state.db + logger := state.logger + state.mu.RUnlock() + if db == nil { + return + } + + var row struct { + ValueBool *bool `gorm:"column:value_bool"` + } + err := db.WithContext(ctx). + Table("platform_settings"). + Select("value_bool"). + Where("key = ?", "maintenance_mode"). + Take(&row).Error + + state.mu.Lock() + state.lastCheck = time.Now() + state.mu.Unlock() + + if err != nil { + if err != gorm.ErrRecordNotFound && logger != nil { + logger.Warn("Failed to refresh maintenance flag from DB — keeping cached value", + zap.Error(err), + ) + } + return + } + + enabled := row.ValueBool != nil && *row.ValueBool + state.mu.Lock() + state.enabled = enabled + state.mu.Unlock() +} + +// MaintenanceModeEnabled returns the cached maintenance flag, refreshing from +// the DB if the TTL has expired and a DB pool has been wired. func MaintenanceModeEnabled() bool { - maintenanceMu.RLock() - defer maintenanceMu.RUnlock() - return maintenanceMode + state.mu.RLock() + enabled := state.enabled + lastCheck := state.lastCheck + hasDB := state.db != nil + ttl := state.ttl + state.mu.RUnlock() + + if hasDB && time.Since(lastCheck) > ttl { + refreshFromDB(context.Background()) + state.mu.RLock() + enabled = state.enabled + state.mu.RUnlock() + } + return enabled } -// SetMaintenanceMode sets maintenance mode (for admin toggle) +// SetMaintenanceMode sets the in-memory flag without touching the DB. It is +// kept for tests and for cases where a caller already owns the DB write — it +// does not persist the value across pods. Use PlatformSettings to change +// state across a deployment. func SetMaintenanceMode(enabled bool) { - maintenanceMu.Lock() - defer maintenanceMu.Unlock() - maintenanceMode = enabled + state.mu.Lock() + state.enabled = enabled + state.lastCheck = time.Now().Add(state.ttl) // suppress the next DB refresh + state.mu.Unlock() } // MaintenanceGin returns a Gin middleware for maintenance mode. diff --git a/veza-backend-api/internal/middleware/maintenance_test.go b/veza-backend-api/internal/middleware/maintenance_test.go index bd43c339f..049af8efb 100644 --- a/veza-backend-api/internal/middleware/maintenance_test.go +++ b/veza-backend-api/internal/middleware/maintenance_test.go @@ -4,9 +4,14 @@ import ( "net/http" "net/http/httptest" "testing" + "time" "github.com/gin-gonic/gin" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/zap/zaptest" + "gorm.io/driver/sqlite" + "gorm.io/gorm" ) func TestMaintenanceGin_Disabled(t *testing.T) { @@ -81,3 +86,53 @@ func TestMaintenanceGin_AdminExempt(t *testing.T) { assert.Equal(t, http.StatusOK, w.Code) } + +// TestMaintenanceGin_DBBacked verifies that changes written to +// platform_settings propagate to MaintenanceModeEnabled() once the cache TTL +// lapses. This guards the multi-pod correctness claim of v1.0.4. +func TestMaintenanceGin_DBBacked(t *testing.T) { + db, err := gorm.Open(sqlite.Open(":memory:"), &gorm.Config{}) + require.NoError(t, err) + + require.NoError(t, db.Exec(` + CREATE TABLE platform_settings ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + key TEXT NOT NULL UNIQUE, + value_bool BOOLEAN, + value_text TEXT, + description TEXT, + updated_at DATETIME, + updated_by TEXT + )`).Error) + require.NoError(t, db.Exec( + `INSERT INTO platform_settings (key, value_bool, description) VALUES ('maintenance_mode', 0, 'test')`, + ).Error) + + // Start from a clean slate so no prior test leaked state into the package + // globals. + SetMaintenanceMode(false) + defer SetMaintenanceMode(false) + + InitMaintenanceMode(db, zaptest.NewLogger(t)) + // Shrink the TTL so we don't have to sleep 10s. + state.mu.Lock() + state.ttl = 50 * time.Millisecond + state.mu.Unlock() + defer func() { + state.mu.Lock() + state.ttl = defaultMaintenanceCacheTTL + state.db = nil + state.mu.Unlock() + }() + + assert.False(t, MaintenanceModeEnabled(), "seeded value=0 should read as off") + + // Flip the DB row; before TTL the cached value still says off. + require.NoError(t, db.Exec( + `UPDATE platform_settings SET value_bool = 1 WHERE key = 'maintenance_mode'`, + ).Error) + assert.False(t, MaintenanceModeEnabled(), "cache should still report off before TTL") + + time.Sleep(70 * time.Millisecond) + assert.True(t, MaintenanceModeEnabled(), "after TTL the refresh should pick up the new value") +} diff --git a/veza-backend-api/migrations/976_platform_settings.sql b/veza-backend-api/migrations/976_platform_settings.sql new file mode 100644 index 000000000..9ce4f5cbf --- /dev/null +++ b/veza-backend-api/migrations/976_platform_settings.sql @@ -0,0 +1,21 @@ +-- Migration 976: Platform-wide runtime settings (v1.0.4) +-- Replaces in-memory maintenance toggle with a DB-backed key/value table so +-- all pods see the same state. Values are typed to avoid string-parsing in +-- the hot path. + +CREATE TABLE IF NOT EXISTS public.platform_settings ( + id SERIAL PRIMARY KEY, + key TEXT NOT NULL UNIQUE, + value_bool BOOLEAN, + value_text TEXT, + description TEXT NOT NULL DEFAULT '', + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_by UUID REFERENCES public.users(id) ON DELETE SET NULL +); + +CREATE INDEX IF NOT EXISTS idx_platform_settings_key ON public.platform_settings(key); + +-- Seed the maintenance_mode row; idempotent so rerunning migrations is safe. +INSERT INTO public.platform_settings (key, value_bool, description) +VALUES ('maintenance_mode', FALSE, 'When TRUE, all API requests outside the exempt list return 503.') +ON CONFLICT (key) DO NOTHING;