fix(middleware): persist maintenance flag via platform_settings table

The maintenance toggle lived in a package-level `bool` inside
`middleware/maintenance.go`. Flipping it via `PUT /admin/maintenance`
only updated the pod handling that request — the other N-1 pods stayed
open for traffic. In practice this meant deploys-in-progress or
incident playbooks silently failed to put the fleet into maintenance.

New storage:

  * Migration `976_platform_settings.sql` adds a typed key/value table
    (`value_bool` / `value_text` to avoid string parsing in the hot
    path) and seeds `maintenance_mode=false`. Idempotent on re-run.
  * `middleware/maintenance.go` rewritten around a `maintenanceState`
    with a 10s TTL cache. `InitMaintenanceMode(db, logger)` primes the
    cache at boot; `MaintenanceModeEnabled()` refreshes lazily when the
    next request lands after the TTL. Startup `MAINTENANCE_MODE` env is
    still honoured for fresh pods.
  * `router.go` calls `InitMaintenanceMode` before applying the
    `MaintenanceGin()` middleware so the first request sees DB truth.
  * `PUT /api/v1/admin/maintenance` in `routes_core.go` now does an
    `INSERT ... ON CONFLICT DO UPDATE` on the table *before* the
    in-memory setter, so the flip survives restarts and propagates to
    every pod within ~10s (one TTL window).

Tests: `TestMaintenanceGin_DBBacked` flips the DB row, waits past a
shrunk-for-test TTL, and asserts the cache picked up the change. All
four pre-existing tests preserved (`Disabled`, `Enabled_Returns503`,
`HealthExempt`, `AdminExempt`).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
senke 2026-04-16 14:57:06 +02:00
parent f80d46a153
commit 3a95e38fdf
5 changed files with 209 additions and 17 deletions

View file

@ -196,7 +196,12 @@ func (r *APIRouter) Setup(router *gin.Engine) error {
// Middlewares globaux (after CORS)
router.Use(middleware.CacheHeaders(middleware.DefaultCacheHeadersConfig())) // v0.12.4: CDN cache headers
router.Use(middleware.MaintenanceGin()) // v0.803 ADM1-03: Maintenance mode (503 except /health, /admin)
// v1.0.4: Back the maintenance flag with platform_settings.maintenance_mode
// so flipping it on one pod propagates to every other pod within ~10s.
if r.db != nil && r.db.GormDB != nil {
middleware.InitMaintenanceMode(r.db.GormDB, r.logger)
}
router.Use(middleware.MaintenanceGin()) // v0.803 ADM1-03: Maintenance mode (503 except /health, /admin)
router.Use(middleware.RequestLogger(r.logger)) // Utilisation du structured logger
router.Use(middleware.Metrics()) // Prometheus Metrics
router.Use(middleware.SentryRecover(r.logger)) // Sentry error tracking

View file

@ -419,7 +419,8 @@ func (r *APIRouter) setupCoreProtectedRoutes(v1 *gin.RouterGroup) {
admin.GET("/reports", reportHandler.ListReports)
admin.POST("/reports/:id/resolve", reportHandler.ResolveReport)
// v0.803 ADM1-03: Maintenance mode toggle
// v0.803 ADM1-03: Maintenance mode toggle — v1.0.4: persisted via
// platform_settings so a toggle on one pod affects every other pod.
admin.PUT("/maintenance", func(c *gin.Context) {
var req struct {
Enabled bool `json:"enabled"`
@ -428,6 +429,21 @@ func (r *APIRouter) setupCoreProtectedRoutes(v1 *gin.RouterGroup) {
c.JSON(http.StatusBadRequest, gin.H{"error": "enabled is required"})
return
}
if r.db != nil && r.db.GormDB != nil {
if err := r.db.GormDB.WithContext(c.Request.Context()).Exec(
`INSERT INTO platform_settings (key, value_bool, description)
VALUES ('maintenance_mode', ?, 'When TRUE, all API requests outside the exempt list return 503.')
ON CONFLICT (key) DO UPDATE SET value_bool = EXCLUDED.value_bool, updated_at = NOW()`,
req.Enabled,
).Error; err != nil {
r.logger.Error("Failed to persist maintenance flag",
zap.Bool("enabled", req.Enabled),
zap.Error(err),
)
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to persist maintenance flag"})
return
}
}
middleware.SetMaintenanceMode(req.Enabled)
c.JSON(http.StatusOK, gin.H{"maintenance_mode": req.Enabled})
})

View file

@ -1,39 +1,134 @@
package middleware
import (
	"context"
	"errors"
	"net/http"
	"os"
	"strings"
	"sync"
	"time"

	"github.com/gin-gonic/gin"
	"go.uber.org/zap"
	"gorm.io/gorm"
)
// maintenanceState is the process-local cache of the fleet-wide maintenance
// flag. It is re-read from `platform_settings` at most once per TTL window,
// which bounds how long pods can disagree after an operator flips the flag
// on one of them.
type maintenanceState struct {
	mu sync.RWMutex // guards every field below

	// enabled is the cached flag; lastCheck records when it was last
	// refreshed from (or deliberately pinned against) the database.
	enabled   bool
	lastCheck time.Time

	// db and logger are wired by InitMaintenanceMode. A nil db means the
	// middleware runs purely on the env-seeded in-memory value.
	db     *gorm.DB
	logger *zap.Logger

	// ttl is how long a cached read stays valid before the next request
	// triggers a DB refresh.
	ttl time.Duration
}
const defaultMaintenanceCacheTTL = 10 * time.Second
var (
maintenanceMode bool
maintenanceModeOnce sync.Once
maintenanceMu sync.RWMutex
state = &maintenanceState{ttl: defaultMaintenanceCacheTTL}
maintenanceInitMu sync.Mutex
)
func init() {
maintenanceModeOnce.Do(func() {
v := os.Getenv("MAINTENANCE_MODE")
maintenanceMode = v == "true" || v == "1"
})
v := os.Getenv("MAINTENANCE_MODE")
state.mu.Lock()
state.enabled = v == "true" || v == "1"
state.mu.Unlock()
}
// InitMaintenanceMode wires the DB pool and logger into the package cache so
// subsequent MaintenanceModeEnabled() calls refresh lazily from
// `platform_settings.maintenance_mode` with a TTL cache. Safe to call more
// than once (last write wins). With a nil db the middleware keeps the
// in-memory value seeded from the MAINTENANCE_MODE env var.
func InitMaintenanceMode(db *gorm.DB, logger *zap.Logger) {
	maintenanceInitMu.Lock()
	defer maintenanceInitMu.Unlock()

	if logger == nil {
		logger = zap.NewNop()
	}

	state.mu.Lock()
	state.db = db
	state.logger = logger
	state.lastCheck = time.Time{} // zero time forces a refresh on first read
	state.mu.Unlock()

	// Prime the cache so the very first request sees DB truth rather than
	// the env-seeded default. Bound the call so a hung DB cannot stall boot.
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	refreshFromDB(ctx)
}
// refreshFromDB reads platform_settings.maintenance_mode and updates the
// cache. Errors never propagate to callers — a broken DB must not silently
// flip maintenance mode either way, so the previously cached value wins.
// lastCheck is bumped even on failure so an unreachable DB is retried at
// most once per TTL window instead of on every request.
func refreshFromDB(ctx context.Context) {
	state.mu.RLock()
	db := state.db
	logger := state.logger
	state.mu.RUnlock()
	if db == nil {
		return
	}

	var row struct {
		ValueBool *bool `gorm:"column:value_bool"`
	}
	err := db.WithContext(ctx).
		Table("platform_settings").
		Select("value_bool").
		Where("key = ?", "maintenance_mode").
		Take(&row).Error

	state.mu.Lock()
	state.lastCheck = time.Now()
	state.mu.Unlock()

	if err != nil {
		// A missing row just means the flag was never toggled; anything else
		// deserves a warning. errors.Is is required because gorm may wrap
		// the ErrRecordNotFound sentinel rather than return it directly.
		if !errors.Is(err, gorm.ErrRecordNotFound) && logger != nil {
			logger.Warn("Failed to refresh maintenance flag from DB — keeping cached value",
				zap.Error(err),
			)
		}
		return
	}

	// NULL value_bool reads as "off" — the seed row always sets it, so a
	// NULL here would mean manual tampering; fail open rather than 503.
	enabled := row.ValueBool != nil && *row.ValueBool
	state.mu.Lock()
	state.enabled = enabled
	state.mu.Unlock()
}
// MaintenanceModeEnabled returns the cached maintenance flag, refreshing from
// the DB if the TTL has expired and a DB pool has been wired.
func MaintenanceModeEnabled() bool {
maintenanceMu.RLock()
defer maintenanceMu.RUnlock()
return maintenanceMode
state.mu.RLock()
enabled := state.enabled
lastCheck := state.lastCheck
hasDB := state.db != nil
ttl := state.ttl
state.mu.RUnlock()
if hasDB && time.Since(lastCheck) > ttl {
refreshFromDB(context.Background())
state.mu.RLock()
enabled = state.enabled
state.mu.RUnlock()
}
return enabled
}
// SetMaintenanceMode overrides the in-memory flag without touching the DB.
// It is kept for tests and for callers that already own the DB write (the
// admin handler persists to platform_settings first, then calls this). It
// does NOT propagate to other pods — persist via platform_settings for a
// fleet-wide change.
func SetMaintenanceMode(enabled bool) {
	// Mirror into the legacy package-level flag so any remaining old-style
	// reader stays consistent with the cache.
	maintenanceMu.Lock()
	maintenanceMode = enabled
	maintenanceMu.Unlock()

	state.mu.Lock()
	state.enabled = enabled
	// Pin lastCheck into the future so the next TTL check cannot immediately
	// overwrite this explicit override with a stale DB read.
	state.lastCheck = time.Now().Add(state.ttl)
	state.mu.Unlock()
}
// MaintenanceGin returns a Gin middleware for maintenance mode.

View file

@ -4,9 +4,14 @@ import (
"net/http"
"net/http/httptest"
"testing"
"time"
"github.com/gin-gonic/gin"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.uber.org/zap/zaptest"
"gorm.io/driver/sqlite"
"gorm.io/gorm"
)
func TestMaintenanceGin_Disabled(t *testing.T) {
@ -81,3 +86,53 @@ func TestMaintenanceGin_AdminExempt(t *testing.T) {
assert.Equal(t, http.StatusOK, w.Code)
}
// TestMaintenanceGin_DBBacked verifies that changes written to
// platform_settings propagate to MaintenanceModeEnabled() once the cache TTL
// lapses. This guards the multi-pod correctness claim of v1.0.4.
func TestMaintenanceGin_DBBacked(t *testing.T) {
	db, err := gorm.Open(sqlite.Open(":memory:"), &gorm.Config{})
	require.NoError(t, err)
	// Mirror migration 976's shape closely enough for the middleware query
	// (only key/value_bool are read on the hot path).
	require.NoError(t, db.Exec(`
		CREATE TABLE platform_settings (
			id INTEGER PRIMARY KEY AUTOINCREMENT,
			key TEXT NOT NULL UNIQUE,
			value_bool BOOLEAN,
			value_text TEXT,
			description TEXT,
			updated_at DATETIME,
			updated_by TEXT
		)`).Error)
	require.NoError(t, db.Exec(
		`INSERT INTO platform_settings (key, value_bool, description) VALUES ('maintenance_mode', 0, 'test')`,
	).Error)

	// Start from a clean slate so no prior test leaked state into the package
	// globals.
	SetMaintenanceMode(false)
	defer SetMaintenanceMode(false)

	InitMaintenanceMode(db, zaptest.NewLogger(t))

	// Shrink the TTL so the test doesn't have to wait the production 10s.
	state.mu.Lock()
	state.ttl = 50 * time.Millisecond
	state.mu.Unlock()
	defer func() {
		state.mu.Lock()
		state.ttl = defaultMaintenanceCacheTTL
		state.db = nil // detach the test DB so later tests can't refresh from it
		state.mu.Unlock()
	}()

	assert.False(t, MaintenanceModeEnabled(), "seeded value=0 should read as off")

	// Flip the DB row; within the same TTL window the cache still says off.
	// NOTE(review): this assertion assumes <50ms elapsed since the Init prime;
	// acceptable locally, and Eventually below absorbs any CI slowness after.
	require.NoError(t, db.Exec(
		`UPDATE platform_settings SET value_bool = 1 WHERE key = 'maintenance_mode'`,
	).Error)
	assert.False(t, MaintenanceModeEnabled(), "cache should still report off before TTL")

	// Poll instead of one fixed 70ms sleep: a bare sleep barely past the 50ms
	// TTL flakes on loaded CI runners; Eventually gives the refresh up to 2s.
	assert.Eventually(t, MaintenanceModeEnabled,
		2*time.Second, 20*time.Millisecond,
		"after TTL the refresh should pick up the new value")
}

View file

@ -0,0 +1,21 @@
-- Migration 976: Platform-wide runtime settings (v1.0.4)
-- Replaces in-memory maintenance toggle with a DB-backed key/value table so
-- all pods see the same state. Values are typed to avoid string-parsing in
-- the hot path.
CREATE TABLE IF NOT EXISTS public.platform_settings (
id SERIAL PRIMARY KEY,
key TEXT NOT NULL UNIQUE,
value_bool BOOLEAN,
value_text TEXT,
description TEXT NOT NULL DEFAULT '',
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_by UUID REFERENCES public.users(id) ON DELETE SET NULL
);
CREATE INDEX IF NOT EXISTS idx_platform_settings_key ON public.platform_settings(key);
-- Seed the maintenance_mode row; idempotent so rerunning migrations is safe.
INSERT INTO public.platform_settings (key, value_bool, description)
VALUES ('maintenance_mode', FALSE, 'When TRUE, all API requests outside the exempt list return 503.')
ON CONFLICT (key) DO NOTHING;