The maintenance toggle lived in a package-level `bool` inside
`middleware/maintenance.go`. Flipping it via `PUT /admin/maintenance`
only updated the pod handling that request — the other N-1 pods stayed
open for traffic. In practice this meant deploys-in-progress or
incident playbooks silently failed to put the fleet into maintenance.
New storage:
* Migration `976_platform_settings.sql` adds a typed key/value table
(`value_bool` / `value_text` to avoid string parsing in the hot
path) and seeds `maintenance_mode=false`. Idempotent on re-run.
* `middleware/maintenance.go` rewritten around a `maintenanceState`
with a 10s TTL cache. `InitMaintenanceMode(db, logger)` primes the
cache at boot; `MaintenanceModeEnabled()` refreshes lazily when the
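next request lands after the TTL. Startup `MAINTENANCE_MODE` env is
still honoured for fresh pods as the initial in-memory value, until the
first successful read of the `maintenance_mode` row replaces it.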
* `router.go` calls `InitMaintenanceMode` before applying the
`MaintenanceGin()` middleware so the first request sees DB truth.
* `PUT /api/v1/admin/maintenance` in `routes_core.go` now does an
`INSERT ... ON CONFLICT DO UPDATE` on the table *before* the
in-memory setter, so the flip survives restarts and propagates to
every pod within ~10s (one TTL window).
Tests: `TestMaintenanceGin_DBBacked` flips the DB row, waits past a
shrunk-for-test TTL, and asserts the cache picked up the change. All
four pre-existing tests preserved (`Disabled`, `Enabled_Returns503`,
`HealthExempt`, `AdminExempt`).
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
166 lines
4.4 KiB
Go
166 lines
4.4 KiB
Go
package middleware
|
|
|
|
import (
	"context"
	"errors"
	"net/http"
	"os"
	"strings"
	"sync"
	"time"

	"github.com/gin-gonic/gin"
	"go.uber.org/zap"
	"gorm.io/gorm"
)
|
|
|
|
// maintenanceState carries the latest cached view of the platform-wide
|
|
// maintenance flag. It is refreshed lazily from `platform_settings` when a
|
|
// request comes in after the TTL has expired, so operators flipping the flag
|
|
// on one pod propagate to every other pod within a bounded window (10s).
|
|
type maintenanceState struct {
|
|
mu sync.RWMutex
|
|
enabled bool
|
|
lastCheck time.Time
|
|
db *gorm.DB
|
|
logger *zap.Logger
|
|
ttl time.Duration
|
|
}
|
|
|
|
const defaultMaintenanceCacheTTL = 10 * time.Second
|
|
|
|
var (
|
|
state = &maintenanceState{ttl: defaultMaintenanceCacheTTL}
|
|
maintenanceInitMu sync.Mutex
|
|
)
|
|
|
|
func init() {
|
|
v := os.Getenv("MAINTENANCE_MODE")
|
|
state.mu.Lock()
|
|
state.enabled = v == "true" || v == "1"
|
|
state.mu.Unlock()
|
|
}
|
|
|
|
// InitMaintenanceMode wires the DB pool so subsequent MaintenanceModeEnabled()
|
|
// calls refresh from `platform_settings.maintenance_mode` with a TTL cache.
|
|
// Safe to call more than once (last write wins). If db is nil the middleware
|
|
// falls back to the in-memory state seeded from MAINTENANCE_MODE.
|
|
func InitMaintenanceMode(db *gorm.DB, logger *zap.Logger) {
|
|
maintenanceInitMu.Lock()
|
|
defer maintenanceInitMu.Unlock()
|
|
|
|
if logger == nil {
|
|
logger = zap.NewNop()
|
|
}
|
|
state.mu.Lock()
|
|
state.db = db
|
|
state.logger = logger
|
|
state.lastCheck = time.Time{} // force refresh on first call
|
|
state.mu.Unlock()
|
|
|
|
// Prime the cache so the very first request doesn't see a stale value.
|
|
refreshFromDB(context.Background())
|
|
}
|
|
|
|
// refreshFromDB reads the current value from the DB and updates the cache.
|
|
// Never propagates errors to callers — a broken DB should not silently
|
|
// enable maintenance mode, so the previous cached value wins.
|
|
func refreshFromDB(ctx context.Context) {
|
|
state.mu.RLock()
|
|
db := state.db
|
|
logger := state.logger
|
|
state.mu.RUnlock()
|
|
if db == nil {
|
|
return
|
|
}
|
|
|
|
var row struct {
|
|
ValueBool *bool `gorm:"column:value_bool"`
|
|
}
|
|
err := db.WithContext(ctx).
|
|
Table("platform_settings").
|
|
Select("value_bool").
|
|
Where("key = ?", "maintenance_mode").
|
|
Take(&row).Error
|
|
|
|
state.mu.Lock()
|
|
state.lastCheck = time.Now()
|
|
state.mu.Unlock()
|
|
|
|
if err != nil {
|
|
if err != gorm.ErrRecordNotFound && logger != nil {
|
|
logger.Warn("Failed to refresh maintenance flag from DB — keeping cached value",
|
|
zap.Error(err),
|
|
)
|
|
}
|
|
return
|
|
}
|
|
|
|
enabled := row.ValueBool != nil && *row.ValueBool
|
|
state.mu.Lock()
|
|
state.enabled = enabled
|
|
state.mu.Unlock()
|
|
}
|
|
|
|
// MaintenanceModeEnabled returns the cached maintenance flag, refreshing from
|
|
// the DB if the TTL has expired and a DB pool has been wired.
|
|
func MaintenanceModeEnabled() bool {
|
|
state.mu.RLock()
|
|
enabled := state.enabled
|
|
lastCheck := state.lastCheck
|
|
hasDB := state.db != nil
|
|
ttl := state.ttl
|
|
state.mu.RUnlock()
|
|
|
|
if hasDB && time.Since(lastCheck) > ttl {
|
|
refreshFromDB(context.Background())
|
|
state.mu.RLock()
|
|
enabled = state.enabled
|
|
state.mu.RUnlock()
|
|
}
|
|
return enabled
|
|
}
|
|
|
|
// SetMaintenanceMode sets the in-memory flag without touching the DB. It is
|
|
// kept for tests and for cases where a caller already owns the DB write — it
|
|
// does not persist the value across pods. Use PlatformSettings to change
|
|
// state across a deployment.
|
|
func SetMaintenanceMode(enabled bool) {
|
|
state.mu.Lock()
|
|
state.enabled = enabled
|
|
state.lastCheck = time.Now().Add(state.ttl) // suppress the next DB refresh
|
|
state.mu.Unlock()
|
|
}
|
|
|
|
// MaintenanceGin returns a Gin middleware for maintenance mode.
|
|
// Exempt paths: /health, /healthz, /readyz, /api/v1/health, /api/v1/admin, /swagger, /docs
|
|
func MaintenanceGin() gin.HandlerFunc {
|
|
return func(c *gin.Context) {
|
|
if !MaintenanceModeEnabled() {
|
|
c.Next()
|
|
return
|
|
}
|
|
path := c.Request.URL.Path
|
|
if isMaintenanceExempt(path) {
|
|
c.Next()
|
|
return
|
|
}
|
|
c.AbortWithStatusJSON(http.StatusServiceUnavailable, gin.H{"error": "Platform is under maintenance"})
|
|
}
|
|
}
|
|
|
|
// isMaintenanceExempt reports whether path must stay reachable while
// maintenance mode is on.
//
// One trailing slash is ignored. A path is exempt when it equals, or is nested
// under, one of the fixed roots below, or when it contains "/api/v1/health" or
// "/api/v1/admin" anywhere in it (substring match, not a prefix match).
func isMaintenanceExempt(path string) bool {
	trimmed := strings.TrimSuffix(path, "/")

	roots := [...]string{
		"/health",
		"/healthz",
		"/readyz",
		"/health/deep",
		"/metrics",
		"/swagger",
		"/docs",
		"/api/versions",
	}
	for i := range roots {
		// Match the root itself or anything below it, but not siblings that
		// merely share the prefix (e.g. "/healthcheck").
		if trimmed == roots[i] || strings.HasPrefix(trimmed, roots[i]+"/") {
			return true
		}
	}

	return strings.Contains(trimmed, "/api/v1/health") ||
		strings.Contains(trimmed, "/api/v1/admin")
}
|