fix(middleware): persist maintenance flag via platform_settings table

The maintenance toggle lived in a package-level `bool` inside
`middleware/maintenance.go`. Flipping it via `PUT /admin/maintenance`
only updated the pod handling that request — the other N-1 pods stayed
open for traffic. In practice this meant deploys-in-progress or
incident playbooks silently failed to put the fleet into maintenance.

New storage:

  * Migration `976_platform_settings.sql` adds a typed key/value table
    (`value_bool` / `value_text` to avoid string parsing in the hot
    path) and seeds `maintenance_mode=false`. Idempotent on re-run.
  * `middleware/maintenance.go` rewritten around a `maintenanceState`
    with a 10s TTL cache. `InitMaintenanceMode(db, logger)` primes the
    cache at boot; `MaintenanceModeEnabled()` refreshes lazily when the
    next request lands after the TTL. Startup `MAINTENANCE_MODE` env is
    still honoured for fresh pods.
  * `router.go` calls `InitMaintenanceMode` before applying the
    `MaintenanceGin()` middleware so the first request sees DB truth.
  * `PUT /api/v1/admin/maintenance` in `routes_core.go` now does an
    `INSERT ... ON CONFLICT DO UPDATE` on the table *before* the
    in-memory setter, so the flip survives restarts and propagates to
    every pod within ~10s (one TTL window).

Tests: `TestMaintenanceGin_DBBacked` flips the DB row, waits past a
shrunk-for-test TTL, and asserts the cache picked up the change. All
four pre-existing tests preserved (`Disabled`, `Enabled_Returns503`,
`HealthExempt`, `AdminExempt`).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
senke 2026-04-16 14:57:06 +02:00
parent f80d46a153
commit 3a95e38fdf
5 changed files with 209 additions and 17 deletions

View file

@ -196,7 +196,12 @@ func (r *APIRouter) Setup(router *gin.Engine) error {
// Middlewares globaux (after CORS)
router.Use(middleware.CacheHeaders(middleware.DefaultCacheHeadersConfig())) // v0.12.4: CDN cache headers
router.Use(middleware.MaintenanceGin()) // v0.803 ADM1-03: Maintenance mode (503 except /health, /admin)
// v1.0.4: Back the maintenance flag with platform_settings.maintenance_mode
// so flipping it on one pod propagates to every other pod within ~10s.
if r.db != nil && r.db.GormDB != nil {
middleware.InitMaintenanceMode(r.db.GormDB, r.logger)
}
router.Use(middleware.MaintenanceGin()) // v0.803 ADM1-03: Maintenance mode (503 except /health, /admin)
router.Use(middleware.RequestLogger(r.logger)) // Utilisation du structured logger
router.Use(middleware.Metrics()) // Prometheus Metrics
router.Use(middleware.SentryRecover(r.logger)) // Sentry error tracking

View file

@ -419,7 +419,8 @@ func (r *APIRouter) setupCoreProtectedRoutes(v1 *gin.RouterGroup) {
admin.GET("/reports", reportHandler.ListReports)
admin.POST("/reports/:id/resolve", reportHandler.ResolveReport)
// v0.803 ADM1-03: Maintenance mode toggle
// v0.803 ADM1-03: Maintenance mode toggle — v1.0.4: persisted via
// platform_settings so a toggle on one pod affects every other pod.
admin.PUT("/maintenance", func(c *gin.Context) {
var req struct {
Enabled bool `json:"enabled"`
@ -428,6 +429,21 @@ func (r *APIRouter) setupCoreProtectedRoutes(v1 *gin.RouterGroup) {
c.JSON(http.StatusBadRequest, gin.H{"error": "enabled is required"})
return
}
if r.db != nil && r.db.GormDB != nil {
if err := r.db.GormDB.WithContext(c.Request.Context()).Exec(
`INSERT INTO platform_settings (key, value_bool, description)
VALUES ('maintenance_mode', ?, 'When TRUE, all API requests outside the exempt list return 503.')
ON CONFLICT (key) DO UPDATE SET value_bool = EXCLUDED.value_bool, updated_at = NOW()`,
req.Enabled,
).Error; err != nil {
r.logger.Error("Failed to persist maintenance flag",
zap.Bool("enabled", req.Enabled),
zap.Error(err),
)
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to persist maintenance flag"})
return
}
}
middleware.SetMaintenanceMode(req.Enabled)
c.JSON(http.StatusOK, gin.H{"maintenance_mode": req.Enabled})
})

View file

@ -1,39 +1,134 @@
package middleware
import (
	"context"
	"errors"
	"net/http"
	"os"
	"strings"
	"sync"
	"time"

	"github.com/gin-gonic/gin"
	"go.uber.org/zap"
	"gorm.io/gorm"
)
// maintenanceState is the process-local cache of the fleet-wide maintenance
// flag. It is re-read from `platform_settings` at most once per TTL window,
// which bounds how long pods can disagree after an operator flips the flag
// on one of them.
type maintenanceState struct {
	mu sync.RWMutex // guards every field below

	// enabled is the cached flag; lastCheck records when it was last
	// refreshed from (or deliberately pinned against) the database.
	enabled   bool
	lastCheck time.Time

	// db and logger are wired by InitMaintenanceMode. A nil db means the
	// middleware runs purely on the env-seeded in-memory value.
	db     *gorm.DB
	logger *zap.Logger

	// ttl is how long a cached read stays valid before the next request
	// triggers a DB refresh.
	ttl time.Duration
}
const defaultMaintenanceCacheTTL = 10 * time.Second
var (
maintenanceMode bool
maintenanceModeOnce sync.Once
maintenanceMu sync.RWMutex
state = &maintenanceState{ttl: defaultMaintenanceCacheTTL}
maintenanceInitMu sync.Mutex
)
func init() {
maintenanceModeOnce.Do(func() {
v := os.Getenv("MAINTENANCE_MODE")
maintenanceMode = v == "true" || v == "1"
})
v := os.Getenv("MAINTENANCE_MODE")
state.mu.Lock()
state.enabled = v == "true" || v == "1"
state.mu.Unlock()
}
// InitMaintenanceMode wires the DB pool and logger into the package cache so
// subsequent MaintenanceModeEnabled() calls refresh lazily from
// `platform_settings.maintenance_mode` with a TTL cache. Safe to call more
// than once (last write wins). With a nil db the middleware keeps the
// in-memory value seeded from the MAINTENANCE_MODE env var.
func InitMaintenanceMode(db *gorm.DB, logger *zap.Logger) {
	maintenanceInitMu.Lock()
	defer maintenanceInitMu.Unlock()

	if logger == nil {
		logger = zap.NewNop()
	}

	state.mu.Lock()
	state.db = db
	state.logger = logger
	state.lastCheck = time.Time{} // zero time forces a refresh on first read
	state.mu.Unlock()

	// Prime the cache so the very first request sees DB truth rather than
	// the env-seeded default. Bound the call so a hung DB cannot stall boot.
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	refreshFromDB(ctx)
}
// refreshFromDB reads platform_settings.maintenance_mode and updates the
// cache. Errors never propagate to callers — a broken DB must not silently
// flip maintenance mode either way, so the previously cached value wins.
// lastCheck is bumped even on failure so an unreachable DB is retried at
// most once per TTL window instead of on every request.
func refreshFromDB(ctx context.Context) {
	state.mu.RLock()
	db := state.db
	logger := state.logger
	state.mu.RUnlock()
	if db == nil {
		return
	}

	var row struct {
		ValueBool *bool `gorm:"column:value_bool"`
	}
	err := db.WithContext(ctx).
		Table("platform_settings").
		Select("value_bool").
		Where("key = ?", "maintenance_mode").
		Take(&row).Error

	state.mu.Lock()
	state.lastCheck = time.Now()
	state.mu.Unlock()

	if err != nil {
		// A missing row just means the flag was never toggled; anything else
		// deserves a warning. errors.Is is required because gorm may wrap
		// the ErrRecordNotFound sentinel rather than return it directly.
		if !errors.Is(err, gorm.ErrRecordNotFound) && logger != nil {
			logger.Warn("Failed to refresh maintenance flag from DB — keeping cached value",
				zap.Error(err),
			)
		}
		return
	}

	// NULL value_bool reads as "off" — the seed row always sets it, so a
	// NULL here would mean manual tampering; fail open rather than 503.
	enabled := row.ValueBool != nil && *row.ValueBool
	state.mu.Lock()
	state.enabled = enabled
	state.mu.Unlock()
}
// MaintenanceModeEnabled returns the cached maintenance flag, refreshing from
// the DB if the TTL has expired and a DB pool has been wired.
func MaintenanceModeEnabled() bool {
maintenanceMu.RLock()
defer maintenanceMu.RUnlock()
return maintenanceMode
state.mu.RLock()
enabled := state.enabled
lastCheck := state.lastCheck
hasDB := state.db != nil
ttl := state.ttl
state.mu.RUnlock()
if hasDB && time.Since(lastCheck) > ttl {
refreshFromDB(context.Background())
state.mu.RLock()
enabled = state.enabled
state.mu.RUnlock()
}
return enabled
}
// SetMaintenanceMode overrides the in-memory flag without touching the DB.
// It is kept for tests and for callers that already own the DB write (the
// admin handler persists to platform_settings first, then calls this). It
// does NOT propagate to other pods — persist via platform_settings for a
// fleet-wide change.
func SetMaintenanceMode(enabled bool) {
	// Mirror into the legacy package-level flag so any remaining old-style
	// reader stays consistent with the cache.
	maintenanceMu.Lock()
	maintenanceMode = enabled
	maintenanceMu.Unlock()

	state.mu.Lock()
	state.enabled = enabled
	// Pin lastCheck into the future so the next TTL check cannot immediately
	// overwrite this explicit override with a stale DB read.
	state.lastCheck = time.Now().Add(state.ttl)
	state.mu.Unlock()
}
// MaintenanceGin returns a Gin middleware for maintenance mode.

View file

@ -4,9 +4,14 @@ import (
"net/http"
"net/http/httptest"
"testing"
"time"
"github.com/gin-gonic/gin"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.uber.org/zap/zaptest"
"gorm.io/driver/sqlite"
"gorm.io/gorm"
)
func TestMaintenanceGin_Disabled(t *testing.T) {
@ -81,3 +86,53 @@ func TestMaintenanceGin_AdminExempt(t *testing.T) {
assert.Equal(t, http.StatusOK, w.Code)
}
// TestMaintenanceGin_DBBacked verifies that changes written to
// platform_settings propagate to MaintenanceModeEnabled() once the cache TTL
// lapses. This guards the multi-pod correctness claim of v1.0.4.
func TestMaintenanceGin_DBBacked(t *testing.T) {
	db, err := gorm.Open(sqlite.Open(":memory:"), &gorm.Config{})
	require.NoError(t, err)
	// Mirror migration 976's shape closely enough for the middleware query
	// (only key/value_bool are read on the hot path).
	require.NoError(t, db.Exec(`
		CREATE TABLE platform_settings (
			id INTEGER PRIMARY KEY AUTOINCREMENT,
			key TEXT NOT NULL UNIQUE,
			value_bool BOOLEAN,
			value_text TEXT,
			description TEXT,
			updated_at DATETIME,
			updated_by TEXT
		)`).Error)
	require.NoError(t, db.Exec(
		`INSERT INTO platform_settings (key, value_bool, description) VALUES ('maintenance_mode', 0, 'test')`,
	).Error)

	// Start from a clean slate so no prior test leaked state into the package
	// globals.
	SetMaintenanceMode(false)
	defer SetMaintenanceMode(false)

	InitMaintenanceMode(db, zaptest.NewLogger(t))

	// Shrink the TTL so the test doesn't have to wait the production 10s.
	state.mu.Lock()
	state.ttl = 50 * time.Millisecond
	state.mu.Unlock()
	defer func() {
		state.mu.Lock()
		state.ttl = defaultMaintenanceCacheTTL
		state.db = nil // detach the test DB so later tests can't refresh from it
		state.mu.Unlock()
	}()

	assert.False(t, MaintenanceModeEnabled(), "seeded value=0 should read as off")

	// Flip the DB row; within the same TTL window the cache still says off.
	// NOTE(review): this assertion assumes <50ms elapsed since the Init prime;
	// acceptable locally, and Eventually below absorbs any CI slowness after.
	require.NoError(t, db.Exec(
		`UPDATE platform_settings SET value_bool = 1 WHERE key = 'maintenance_mode'`,
	).Error)
	assert.False(t, MaintenanceModeEnabled(), "cache should still report off before TTL")

	// Poll instead of one fixed 70ms sleep: a bare sleep barely past the 50ms
	// TTL flakes on loaded CI runners; Eventually gives the refresh up to 2s.
	assert.Eventually(t, MaintenanceModeEnabled,
		2*time.Second, 20*time.Millisecond,
		"after TTL the refresh should pick up the new value")
}

View file

@ -0,0 +1,21 @@
-- Migration 976: Platform-wide runtime settings (v1.0.4)
-- Replaces in-memory maintenance toggle with a DB-backed key/value table so
-- all pods see the same state. Values are typed to avoid string-parsing in
-- the hot path.
CREATE TABLE IF NOT EXISTS public.platform_settings (
id SERIAL PRIMARY KEY,
key TEXT NOT NULL UNIQUE,
value_bool BOOLEAN,
value_text TEXT,
description TEXT NOT NULL DEFAULT '',
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_by UUID REFERENCES public.users(id) ON DELETE SET NULL
);
CREATE INDEX IF NOT EXISTS idx_platform_settings_key ON public.platform_settings(key);
-- Seed the maintenance_mode row; idempotent so rerunning migrations is safe.
INSERT INTO public.platform_settings (key, value_bool, description)
VALUES ('maintenance_mode', FALSE, 'When TRUE, all API requests outside the exempt list return 503.')
ON CONFLICT (key) DO NOTHING;