From 3a95e38fdfa451fe718c06a7ae348fc385061f6c Mon Sep 17 00:00:00 2001 From: senke Date: Thu, 16 Apr 2026 14:57:06 +0200 Subject: [PATCH] fix(middleware): persist maintenance flag via platform_settings table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The maintenance toggle lived in a package-level `bool` inside `middleware/maintenance.go`. Flipping it via `PUT /admin/maintenance` only updated the pod handling that request — the other N-1 pods stayed open for traffic. In practice this meant deploys-in-progress or incident playbooks silently failed to put the fleet into maintenance. New storage: * Migration `976_platform_settings.sql` adds a typed key/value table (`value_bool` / `value_text` to avoid string parsing in the hot path) and seeds `maintenance_mode=false`. Idempotent on re-run. * `middleware/maintenance.go` rewritten around a `maintenanceState` with a 10s TTL cache. `InitMaintenanceMode(db, logger)` primes the cache at boot; `MaintenanceModeEnabled()` refreshes lazily when the next request lands after the TTL. Startup `MAINTENANCE_MODE` env is still honoured for fresh pods. * `router.go` calls `InitMaintenanceMode` before applying the `MaintenanceGin()` middleware so the first request sees DB truth. * `PUT /api/v1/admin/maintenance` in `routes_core.go` now does an `INSERT ... ON CONFLICT DO UPDATE` on the table *before* the in-memory setter, so the flip survives restarts and propagates to every pod within ~10s (one TTL window). Tests: `TestMaintenanceGin_DBBacked` flips the DB row, waits past a shrunk-for-test TTL, and asserts the cache picked up the change. All four pre-existing tests preserved (`Disabled`, `Enabled_Returns503`, `HealthExempt`, `AdminExempt`). 
Co-Authored-By: Claude Opus 4.6 (1M context) --- veza-backend-api/internal/api/router.go | 7 +- veza-backend-api/internal/api/routes_core.go | 18 ++- .../internal/middleware/maintenance.go | 125 +++++++++++++++--- .../internal/middleware/maintenance_test.go | 55 ++++++++ .../migrations/976_platform_settings.sql | 21 +++ 5 files changed, 209 insertions(+), 17 deletions(-) create mode 100644 veza-backend-api/migrations/976_platform_settings.sql diff --git a/veza-backend-api/internal/api/router.go b/veza-backend-api/internal/api/router.go index 316742647..d3b670ff8 100644 --- a/veza-backend-api/internal/api/router.go +++ b/veza-backend-api/internal/api/router.go @@ -196,7 +196,12 @@ func (r *APIRouter) Setup(router *gin.Engine) error { // Middlewares globaux (after CORS) router.Use(middleware.CacheHeaders(middleware.DefaultCacheHeadersConfig())) // v0.12.4: CDN cache headers - router.Use(middleware.MaintenanceGin()) // v0.803 ADM1-03: Maintenance mode (503 except /health, /admin) + // v1.0.4: Back the maintenance flag with platform_settings.maintenance_mode + // so flipping it on one pod propagates to every other pod within ~10s. 
+ if r.db != nil && r.db.GormDB != nil { + middleware.InitMaintenanceMode(r.db.GormDB, r.logger) + } + router.Use(middleware.MaintenanceGin()) // v0.803 ADM1-03: Maintenance mode (503 except /health, /admin) router.Use(middleware.RequestLogger(r.logger)) // Utilisation du structured logger router.Use(middleware.Metrics()) // Prometheus Metrics router.Use(middleware.SentryRecover(r.logger)) // Sentry error tracking diff --git a/veza-backend-api/internal/api/routes_core.go b/veza-backend-api/internal/api/routes_core.go index 490365720..20b65a543 100644 --- a/veza-backend-api/internal/api/routes_core.go +++ b/veza-backend-api/internal/api/routes_core.go @@ -419,7 +419,8 @@ func (r *APIRouter) setupCoreProtectedRoutes(v1 *gin.RouterGroup) { admin.GET("/reports", reportHandler.ListReports) admin.POST("/reports/:id/resolve", reportHandler.ResolveReport) - // v0.803 ADM1-03: Maintenance mode toggle + // v0.803 ADM1-03: Maintenance mode toggle — v1.0.4: persisted via + // platform_settings so a toggle on one pod affects every other pod. 
admin.PUT("/maintenance", func(c *gin.Context) { var req struct { Enabled bool `json:"enabled"` @@ -428,6 +429,21 @@ func (r *APIRouter) setupCoreProtectedRoutes(v1 *gin.RouterGroup) { c.JSON(http.StatusBadRequest, gin.H{"error": "enabled is required"}) return } + if r.db != nil && r.db.GormDB != nil { + if err := r.db.GormDB.WithContext(c.Request.Context()).Exec( + `INSERT INTO platform_settings (key, value_bool, description) + VALUES ('maintenance_mode', ?, 'When TRUE, all API requests outside the exempt list return 503.') + ON CONFLICT (key) DO UPDATE SET value_bool = EXCLUDED.value_bool, updated_at = NOW()`, + req.Enabled, + ).Error; err != nil { + r.logger.Error("Failed to persist maintenance flag", + zap.Bool("enabled", req.Enabled), + zap.Error(err), + ) + c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to persist maintenance flag"}) + return + } + } middleware.SetMaintenanceMode(req.Enabled) c.JSON(http.StatusOK, gin.H{"maintenance_mode": req.Enabled}) }) diff --git a/veza-backend-api/internal/middleware/maintenance.go b/veza-backend-api/internal/middleware/maintenance.go index a1f34af74..7c57e5bc8 100644 --- a/veza-backend-api/internal/middleware/maintenance.go +++ b/veza-backend-api/internal/middleware/maintenance.go @@ -1,39 +1,134 @@ package middleware import ( + "context" "net/http" "os" "strings" "sync" + "time" "github.com/gin-gonic/gin" + "go.uber.org/zap" + "gorm.io/gorm" ) +// maintenanceState carries the latest cached view of the platform-wide +// maintenance flag. It is refreshed lazily from `platform_settings` when a +// request comes in after the TTL has expired, so operators flipping the flag +// on one pod propagate to every other pod within a bounded window (10s). 
+type maintenanceState struct { + mu sync.RWMutex + enabled bool + lastCheck time.Time + db *gorm.DB + logger *zap.Logger + ttl time.Duration +} + +const defaultMaintenanceCacheTTL = 10 * time.Second + var ( - maintenanceMode bool - maintenanceModeOnce sync.Once - maintenanceMu sync.RWMutex + state = &maintenanceState{ttl: defaultMaintenanceCacheTTL} + maintenanceInitMu sync.Mutex ) func init() { - maintenanceModeOnce.Do(func() { - v := os.Getenv("MAINTENANCE_MODE") - maintenanceMode = v == "true" || v == "1" - }) + v := os.Getenv("MAINTENANCE_MODE") + state.mu.Lock() + state.enabled = v == "true" || v == "1" + state.mu.Unlock() } -// MaintenanceModeEnabled returns whether maintenance mode is active +// InitMaintenanceMode wires the DB pool so subsequent MaintenanceModeEnabled() +// calls refresh from `platform_settings.maintenance_mode` with a TTL cache. +// Safe to call more than once (last write wins). If db is nil the middleware +// falls back to the in-memory state seeded from MAINTENANCE_MODE. +func InitMaintenanceMode(db *gorm.DB, logger *zap.Logger) { + maintenanceInitMu.Lock() + defer maintenanceInitMu.Unlock() + + if logger == nil { + logger = zap.NewNop() + } + state.mu.Lock() + state.db = db + state.logger = logger + state.lastCheck = time.Time{} // force refresh on first call + state.mu.Unlock() + + // Prime the cache so the very first request doesn't see a stale value. + refreshFromDB(context.Background()) +} + +// refreshFromDB reads the current value from the DB and updates the cache. +// Never propagates errors to callers — a broken DB should not silently +// enable maintenance mode, so the previous cached value wins. +func refreshFromDB(ctx context.Context) { + state.mu.RLock() + db := state.db + logger := state.logger + state.mu.RUnlock() + if db == nil { + return + } + + var row struct { + ValueBool *bool `gorm:"column:value_bool"` + } + err := db.WithContext(ctx). + Table("platform_settings"). + Select("value_bool"). 
+ Where("key = ?", "maintenance_mode"). + Take(&row).Error + + state.mu.Lock() + state.lastCheck = time.Now() + state.mu.Unlock() + + if err != nil { + if err != gorm.ErrRecordNotFound && logger != nil { + logger.Warn("Failed to refresh maintenance flag from DB — keeping cached value", + zap.Error(err), + ) + } + return + } + + enabled := row.ValueBool != nil && *row.ValueBool + state.mu.Lock() + state.enabled = enabled + state.mu.Unlock() +} + +// MaintenanceModeEnabled returns the cached maintenance flag, refreshing from +// the DB if the TTL has expired and a DB pool has been wired. func MaintenanceModeEnabled() bool { - maintenanceMu.RLock() - defer maintenanceMu.RUnlock() - return maintenanceMode + state.mu.RLock() + enabled := state.enabled + lastCheck := state.lastCheck + hasDB := state.db != nil + ttl := state.ttl + state.mu.RUnlock() + + if hasDB && time.Since(lastCheck) > ttl { + refreshFromDB(context.Background()) + state.mu.RLock() + enabled = state.enabled + state.mu.RUnlock() + } + return enabled } -// SetMaintenanceMode sets maintenance mode (for admin toggle) +// SetMaintenanceMode sets the in-memory flag without touching the DB. It is +// kept for tests and for cases where a caller already owns the DB write — it +// does not persist the value across pods. Use PlatformSettings to change +// state across a deployment. func SetMaintenanceMode(enabled bool) { - maintenanceMu.Lock() - defer maintenanceMu.Unlock() - maintenanceMode = enabled + state.mu.Lock() + state.enabled = enabled + state.lastCheck = time.Now().Add(state.ttl) // suppress the next DB refresh + state.mu.Unlock() } // MaintenanceGin returns a Gin middleware for maintenance mode. 
diff --git a/veza-backend-api/internal/middleware/maintenance_test.go b/veza-backend-api/internal/middleware/maintenance_test.go
index bd43c339f..049af8efb 100644
--- a/veza-backend-api/internal/middleware/maintenance_test.go
+++ b/veza-backend-api/internal/middleware/maintenance_test.go
@@ -4,9 +4,14 @@ import (
 	"net/http"
 	"net/http/httptest"
 	"testing"
+	"time"
 
 	"github.com/gin-gonic/gin"
 	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	"go.uber.org/zap/zaptest"
+	"gorm.io/driver/sqlite"
+	"gorm.io/gorm"
 )
 
 func TestMaintenanceGin_Disabled(t *testing.T) {
@@ -81,3 +86,53 @@ func TestMaintenanceGin_AdminExempt(t *testing.T) {
 
 	assert.Equal(t, http.StatusOK, w.Code)
 }
+
+// TestMaintenanceGin_DBBacked verifies that changes written to
+// platform_settings propagate to MaintenanceModeEnabled() once the cache TTL
+// lapses. This guards the multi-pod correctness claim of v1.0.4.
+func TestMaintenanceGin_DBBacked(t *testing.T) {
+	db, err := gorm.Open(sqlite.Open(":memory:"), &gorm.Config{})
+	require.NoError(t, err)
+
+	require.NoError(t, db.Exec(`
+		CREATE TABLE platform_settings (
+			id INTEGER PRIMARY KEY AUTOINCREMENT,
+			key TEXT NOT NULL UNIQUE,
+			value_bool BOOLEAN,
+			value_text TEXT,
+			description TEXT,
+			updated_at DATETIME,
+			updated_by TEXT
+		)`).Error)
+	require.NoError(t, db.Exec(
+		`INSERT INTO platform_settings (key, value_bool, description) VALUES ('maintenance_mode', 0, 'test')`,
+	).Error)
+
+	// Start from a clean slate so no prior test leaked state into the package
+	// globals.
+	SetMaintenanceMode(false)
+	defer SetMaintenanceMode(false)
+
+	InitMaintenanceMode(db, zaptest.NewLogger(t))
+	// Shrink the TTL so we don't have to sleep 10s.
+	state.mu.Lock()
+	state.ttl = 50 * time.Millisecond
+	state.mu.Unlock()
+	defer func() {
+		state.mu.Lock()
+		state.ttl = defaultMaintenanceCacheTTL
+		state.db = nil
+		state.mu.Unlock()
+	}()
+
+	assert.False(t, MaintenanceModeEnabled(), "seeded value=0 should read as off")
+
+	// Flip the DB row; before TTL the cached value still says off.
+	require.NoError(t, db.Exec(
+		`UPDATE platform_settings SET value_bool = 1 WHERE key = 'maintenance_mode'`,
+	).Error)
+	assert.False(t, MaintenanceModeEnabled(), "cache should still report off before TTL")
+
+	time.Sleep(70 * time.Millisecond)
+	assert.True(t, MaintenanceModeEnabled(), "after TTL the refresh should pick up the new value")
+}
diff --git a/veza-backend-api/migrations/976_platform_settings.sql b/veza-backend-api/migrations/976_platform_settings.sql
new file mode 100644
index 000000000..9ce4f5cbf
--- /dev/null
+++ b/veza-backend-api/migrations/976_platform_settings.sql
@@ -0,0 +1,19 @@
+-- Migration 976: Platform-wide runtime settings (v1.0.4)
+-- Replaces in-memory maintenance toggle with a DB-backed key/value table so
+-- all pods see the same state. Values are typed to avoid string-parsing in
+-- the hot path.
+
+CREATE TABLE IF NOT EXISTS public.platform_settings (
+    id          SERIAL PRIMARY KEY,
+    key         TEXT NOT NULL UNIQUE, -- UNIQUE already creates the index used for key lookups
+    value_bool  BOOLEAN,
+    value_text  TEXT,
+    description TEXT NOT NULL DEFAULT '',
+    updated_at  TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+    updated_by  UUID REFERENCES public.users(id) ON DELETE SET NULL
+);
+
+-- Seed the maintenance_mode row; idempotent so rerunning migrations is safe.
+INSERT INTO public.platform_settings (key, value_bool, description)
+VALUES ('maintenance_mode', FALSE, 'When TRUE, all API requests outside the exempt list return 503.')
+ON CONFLICT (key) DO NOTHING;