fix(middleware): persist maintenance flag via platform_settings table
The maintenance toggle lived in a package-level `bool` inside
`middleware/maintenance.go`. Flipping it via `PUT /admin/maintenance`
only updated the pod handling that request — the other N-1 pods stayed
open for traffic. In practice this meant deploys-in-progress or
incident playbooks silently failed to put the fleet into maintenance.
New storage:
* Migration `976_platform_settings.sql` adds a typed key/value table
(`value_bool` / `value_text` to avoid string parsing in the hot
path) and seeds `maintenance_mode=false`. Idempotent on re-run.
* `middleware/maintenance.go` rewritten around a `maintenanceState`
with a 10s TTL cache. `InitMaintenanceMode(db, logger)` primes the
cache at boot; `MaintenanceModeEnabled()` refreshes lazily when the
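next request lands after the TTL. The startup `MAINTENANCE_MODE` env var
still seeds the in-memory flag, but once `InitMaintenanceMode` reads the
`maintenance_mode` row the DB value overrides it; the env var only decides
the state when no DB is wired or the row cannot be read.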
* `router.go` calls `InitMaintenanceMode` before applying the
`MaintenanceGin()` middleware so the first request sees DB truth.
* `PUT /api/v1/admin/maintenance` in `routes_core.go` now does an
`INSERT ... ON CONFLICT DO UPDATE` on the table *before* the
in-memory setter, so the flip survives restarts and propagates to
every pod within ~10s (one TTL window).
Tests: `TestMaintenanceGin_DBBacked` flips the DB row, waits past a
shrunk-for-test TTL, and asserts the cache picked up the change. All
four pre-existing tests preserved (`Disabled`, `Enabled_Returns503`,
`HealthExempt`, `AdminExempt`).
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
97ca5209a1
commit
1cab2a1d56
5 changed files with 209 additions and 17 deletions
|
|
@ -196,7 +196,12 @@ func (r *APIRouter) Setup(router *gin.Engine) error {
|
|||
|
||||
// Middlewares globaux (after CORS)
|
||||
router.Use(middleware.CacheHeaders(middleware.DefaultCacheHeadersConfig())) // v0.12.4: CDN cache headers
|
||||
router.Use(middleware.MaintenanceGin()) // v0.803 ADM1-03: Maintenance mode (503 except /health, /admin)
|
||||
// v1.0.4: Back the maintenance flag with platform_settings.maintenance_mode
|
||||
// so flipping it on one pod propagates to every other pod within ~10s.
|
||||
if r.db != nil && r.db.GormDB != nil {
|
||||
middleware.InitMaintenanceMode(r.db.GormDB, r.logger)
|
||||
}
|
||||
router.Use(middleware.MaintenanceGin()) // v0.803 ADM1-03: Maintenance mode (503 except /health, /admin)
|
||||
router.Use(middleware.RequestLogger(r.logger)) // Utilisation du structured logger
|
||||
router.Use(middleware.Metrics()) // Prometheus Metrics
|
||||
router.Use(middleware.SentryRecover(r.logger)) // Sentry error tracking
|
||||
|
|
|
|||
|
|
@ -419,7 +419,8 @@ func (r *APIRouter) setupCoreProtectedRoutes(v1 *gin.RouterGroup) {
|
|||
admin.GET("/reports", reportHandler.ListReports)
|
||||
admin.POST("/reports/:id/resolve", reportHandler.ResolveReport)
|
||||
|
||||
// v0.803 ADM1-03: Maintenance mode toggle
|
||||
// v0.803 ADM1-03: Maintenance mode toggle — v1.0.4: persisted via
|
||||
// platform_settings so a toggle on one pod affects every other pod.
|
||||
admin.PUT("/maintenance", func(c *gin.Context) {
|
||||
var req struct {
|
||||
Enabled bool `json:"enabled"`
|
||||
|
|
@ -428,6 +429,21 @@ func (r *APIRouter) setupCoreProtectedRoutes(v1 *gin.RouterGroup) {
|
|||
c.JSON(http.StatusBadRequest, gin.H{"error": "enabled is required"})
|
||||
return
|
||||
}
|
||||
if r.db != nil && r.db.GormDB != nil {
|
||||
if err := r.db.GormDB.WithContext(c.Request.Context()).Exec(
|
||||
`INSERT INTO platform_settings (key, value_bool, description)
|
||||
VALUES ('maintenance_mode', ?, 'When TRUE, all API requests outside the exempt list return 503.')
|
||||
ON CONFLICT (key) DO UPDATE SET value_bool = EXCLUDED.value_bool, updated_at = NOW()`,
|
||||
req.Enabled,
|
||||
).Error; err != nil {
|
||||
r.logger.Error("Failed to persist maintenance flag",
|
||||
zap.Bool("enabled", req.Enabled),
|
||||
zap.Error(err),
|
||||
)
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to persist maintenance flag"})
|
||||
return
|
||||
}
|
||||
}
|
||||
middleware.SetMaintenanceMode(req.Enabled)
|
||||
c.JSON(http.StatusOK, gin.H{"maintenance_mode": req.Enabled})
|
||||
})
|
||||
|
|
|
|||
|
|
@ -1,39 +1,134 @@
|
|||
package middleware
|
||||
|
||||
import (
	"context"
	"errors"
	"net/http"
	"os"
	"strings"
	"sync"
	"time"

	"github.com/gin-gonic/gin"
	"go.uber.org/zap"
	"gorm.io/gorm"
)
|
||||
|
||||
// maintenanceState carries the latest cached view of the platform-wide
|
||||
// maintenance flag. It is refreshed lazily from `platform_settings` when a
|
||||
// request comes in after the TTL has expired, so operators flipping the flag
|
||||
// on one pod propagate to every other pod within a bounded window (10s).
|
||||
type maintenanceState struct {
|
||||
mu sync.RWMutex // guards every field below
|
||||
enabled bool // last known value of the maintenance flag
|
||||
lastCheck time.Time // time of the last DB refresh attempt; the zero value forces a refresh
|
||||
db *gorm.DB // handle used to read platform_settings; nil disables DB refreshes
|
||||
logger *zap.Logger // receives a warning when a refresh query fails
|
||||
ttl time.Duration // how long a cached value is served before the DB is consulted again
|
||||
}
|
||||
|
||||
const defaultMaintenanceCacheTTL = 10 * time.Second // default lifetime of a cached maintenance flag
|
||||
|
||||
var (
|
||||
maintenanceMode bool // legacy in-memory flag, written by init and SetMaintenanceMode
|
||||
maintenanceModeOnce sync.Once // wraps the legacy env read performed in init
|
||||
maintenanceMu sync.RWMutex // guards maintenanceMode
|
||||
state = &maintenanceState{ttl: defaultMaintenanceCacheTTL} // process-wide cache of the maintenance flag
|
||||
maintenanceInitMu sync.Mutex // serializes calls to InitMaintenanceMode
|
||||
)
|
||||
|
||||
func init() {
|
||||
maintenanceModeOnce.Do(func() {
|
||||
v := os.Getenv("MAINTENANCE_MODE")
|
||||
maintenanceMode = v == "true" || v == "1"
|
||||
})
|
||||
v := os.Getenv("MAINTENANCE_MODE")
|
||||
state.mu.Lock()
|
||||
state.enabled = v == "true" || v == "1"
|
||||
state.mu.Unlock()
|
||||
}
|
||||
|
||||
// MaintenanceModeEnabled returns whether maintenance mode is active
|
||||
// InitMaintenanceMode wires the DB pool so subsequent MaintenanceModeEnabled()
|
||||
// calls refresh from `platform_settings.maintenance_mode` with a TTL cache.
|
||||
// Safe to call more than once (last write wins). If db is nil the middleware
|
||||
// falls back to the in-memory state seeded from MAINTENANCE_MODE.
|
||||
func InitMaintenanceMode(db *gorm.DB, logger *zap.Logger) {
|
||||
maintenanceInitMu.Lock()
|
||||
defer maintenanceInitMu.Unlock()
|
||||
|
||||
if logger == nil {
|
||||
logger = zap.NewNop()
|
||||
}
|
||||
state.mu.Lock()
|
||||
state.db = db
|
||||
state.logger = logger
|
||||
state.lastCheck = time.Time{} // force refresh on first call
|
||||
state.mu.Unlock()
|
||||
|
||||
// Prime the cache so the very first request doesn't see a stale value.
|
||||
refreshFromDB(context.Background())
|
||||
}
|
||||
|
||||
// refreshFromDB reads the current value from the DB and updates the cache.
|
||||
// Never propagates errors to callers — a broken DB should not silently
|
||||
// enable maintenance mode, so the previous cached value wins.
|
||||
func refreshFromDB(ctx context.Context) {
|
||||
state.mu.RLock()
|
||||
db := state.db
|
||||
logger := state.logger
|
||||
state.mu.RUnlock()
|
||||
if db == nil {
|
||||
return
|
||||
}
|
||||
|
||||
var row struct {
|
||||
ValueBool *bool `gorm:"column:value_bool"`
|
||||
}
|
||||
err := db.WithContext(ctx).
|
||||
Table("platform_settings").
|
||||
Select("value_bool").
|
||||
Where("key = ?", "maintenance_mode").
|
||||
Take(&row).Error
|
||||
|
||||
state.mu.Lock()
|
||||
state.lastCheck = time.Now()
|
||||
state.mu.Unlock()
|
||||
|
||||
if err != nil {
|
||||
if err != gorm.ErrRecordNotFound && logger != nil {
|
||||
logger.Warn("Failed to refresh maintenance flag from DB — keeping cached value",
|
||||
zap.Error(err),
|
||||
)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
enabled := row.ValueBool != nil && *row.ValueBool
|
||||
state.mu.Lock()
|
||||
state.enabled = enabled
|
||||
state.mu.Unlock()
|
||||
}
|
||||
|
||||
// MaintenanceModeEnabled returns the cached maintenance flag, refreshing from
|
||||
// the DB if the TTL has expired and a DB pool has been wired.
|
||||
func MaintenanceModeEnabled() bool {
|
||||
maintenanceMu.RLock()
|
||||
defer maintenanceMu.RUnlock()
|
||||
return maintenanceMode
|
||||
state.mu.RLock()
|
||||
enabled := state.enabled
|
||||
lastCheck := state.lastCheck
|
||||
hasDB := state.db != nil
|
||||
ttl := state.ttl
|
||||
state.mu.RUnlock()
|
||||
|
||||
if hasDB && time.Since(lastCheck) > ttl {
|
||||
refreshFromDB(context.Background())
|
||||
state.mu.RLock()
|
||||
enabled = state.enabled
|
||||
state.mu.RUnlock()
|
||||
}
|
||||
return enabled
|
||||
}
|
||||
|
||||
// SetMaintenanceMode sets maintenance mode (for admin toggle)
|
||||
// SetMaintenanceMode sets the in-memory flag without touching the DB. It is
|
||||
// kept for tests and for cases where a caller already owns the DB write — it
|
||||
// does not persist the value across pods. Use PlatformSettings to change
|
||||
// state across a deployment.
|
||||
func SetMaintenanceMode(enabled bool) {
|
||||
maintenanceMu.Lock()
|
||||
defer maintenanceMu.Unlock()
|
||||
maintenanceMode = enabled
|
||||
state.mu.Lock()
|
||||
state.enabled = enabled
|
||||
state.lastCheck = time.Now().Add(state.ttl) // suppress the next DB refresh
|
||||
state.mu.Unlock()
|
||||
}
|
||||
|
||||
// MaintenanceGin returns a Gin middleware for maintenance mode.
|
||||
|
|
|
|||
|
|
@ -4,9 +4,14 @@ import (
|
|||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/gin-gonic/gin"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
"go.uber.org/zap/zaptest"
|
||||
"gorm.io/driver/sqlite"
|
||||
"gorm.io/gorm"
|
||||
)
|
||||
|
||||
func TestMaintenanceGin_Disabled(t *testing.T) {
|
||||
|
|
@ -81,3 +86,53 @@ func TestMaintenanceGin_AdminExempt(t *testing.T) {
|
|||
|
||||
assert.Equal(t, http.StatusOK, w.Code)
|
||||
}
|
||||
|
||||
// TestMaintenanceGin_DBBacked verifies that changes written to
|
||||
// platform_settings propagate to MaintenanceModeEnabled() once the cache TTL
|
||||
// lapses. This guards the multi-pod correctness claim of v1.0.4.
|
||||
func TestMaintenanceGin_DBBacked(t *testing.T) {
|
||||
db, err := gorm.Open(sqlite.Open(":memory:"), &gorm.Config{})
|
||||
require.NoError(t, err)
|
||||
|
||||
require.NoError(t, db.Exec(`
|
||||
CREATE TABLE platform_settings (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
key TEXT NOT NULL UNIQUE,
|
||||
value_bool BOOLEAN,
|
||||
value_text TEXT,
|
||||
description TEXT,
|
||||
updated_at DATETIME,
|
||||
updated_by TEXT
|
||||
)`).Error)
|
||||
require.NoError(t, db.Exec(
|
||||
`INSERT INTO platform_settings (key, value_bool, description) VALUES ('maintenance_mode', 0, 'test')`,
|
||||
).Error)
|
||||
|
||||
// Start from a clean slate so no prior test leaked state into the package
|
||||
// globals.
|
||||
SetMaintenanceMode(false)
|
||||
defer SetMaintenanceMode(false)
|
||||
|
||||
InitMaintenanceMode(db, zaptest.NewLogger(t))
|
||||
// Shrink the TTL so we don't have to sleep 10s.
|
||||
state.mu.Lock()
|
||||
state.ttl = 50 * time.Millisecond
|
||||
state.mu.Unlock()
|
||||
defer func() {
|
||||
state.mu.Lock()
|
||||
state.ttl = defaultMaintenanceCacheTTL
|
||||
state.db = nil
|
||||
state.mu.Unlock()
|
||||
}()
|
||||
|
||||
assert.False(t, MaintenanceModeEnabled(), "seeded value=0 should read as off")
|
||||
|
||||
// Flip the DB row; before TTL the cached value still says off.
|
||||
require.NoError(t, db.Exec(
|
||||
`UPDATE platform_settings SET value_bool = 1 WHERE key = 'maintenance_mode'`,
|
||||
).Error)
|
||||
assert.False(t, MaintenanceModeEnabled(), "cache should still report off before TTL")
|
||||
|
||||
time.Sleep(70 * time.Millisecond)
|
||||
assert.True(t, MaintenanceModeEnabled(), "after TTL the refresh should pick up the new value")
|
||||
}
|
||||
|
|
|
|||
21
veza-backend-api/migrations/976_platform_settings.sql
Normal file
21
veza-backend-api/migrations/976_platform_settings.sql
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
-- Migration 976: Platform-wide runtime settings (v1.0.4)
-- Replaces in-memory maintenance toggle with a DB-backed key/value table so
-- all pods see the same state. Values are typed to avoid string-parsing in
-- the hot path.

CREATE TABLE IF NOT EXISTS public.platform_settings (
    id          SERIAL PRIMARY KEY,
    key         TEXT NOT NULL UNIQUE,
    value_bool  BOOLEAN,
    value_text  TEXT,
    description TEXT NOT NULL DEFAULT '',
    updated_at  TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_by  UUID REFERENCES public.users(id) ON DELETE SET NULL
);

-- No separate index on `key`: the UNIQUE constraint above already makes
-- PostgreSQL create a unique index on that column, so an additional
-- idx_platform_settings_key would only duplicate it and add write overhead.

-- Seed the maintenance_mode row; idempotent so rerunning migrations is safe.
INSERT INTO public.platform_settings (key, value_bool, description)
VALUES ('maintenance_mode', FALSE, 'When TRUE, all API requests outside the exempt list return 503.')
ON CONFLICT (key) DO NOTHING;
|
||||
Loading…
Reference in a new issue