fix(chat,config): require REDIS_URL in prod + error on in-memory fallback
Two connected failure modes that silently break multi-pod deployments:
1. `RedisURL` has a struct-level default (`redis://<appDomain>:6379`)
that makes `c.RedisURL == ""` always false. An operator forgetting
to set `REDIS_URL` booted against a phantom host — every Redis call
would then fail, and `ChatPubSubService` would quietly fall back to
an in-memory map. On a single-pod deploy that "works"; on two pods
it silently partitions chat (messages on pod A never reach
subscribers on pod B).
2. The fallback itself was logged at `Warn` level, buried under normal
traffic. Operators only noticed when users reported stuck chats.
Changes:
* `config.go` (`ValidateForEnvironment` prod branch): new check that
`os.Getenv("REDIS_URL")` is non-empty. The struct field is left
alone (dev + test still use the default); we inspect the raw env so
the check is "explicitly set" rather than "non-empty after defaults".
* `chat_pubsub.go` `NewChatPubSubService`: if `redisClient == nil`,
emit an `ERROR` at construction time naming the failure mode
("cross-instance messages will be lost"). Same `Warn`→`Error`
promotion for the `Publish` fallback path — runbook-worthy.
Tests: new `chat_pubsub_test.go` with a `zaptest/observer` that asserts
the ERROR-level log fires exactly once when Redis is nil, plus an
in-memory fan-out happy-path so single-pod dev behaviour stays covered.
New `TestValidateForEnvironment_RedisURLRequiredInProduction` mirrors
the Hyperswitch guard test shape.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
4538c6b281
commit
f80d46a153
4 changed files with 118 additions and 1 deletions
|
|
@ -868,6 +868,14 @@ func (c *Config) ValidateForEnvironment() error {
|
|||
return fmt.Errorf("HYPERSWITCH_ENABLED must be true in production. With payments disabled, marketplace orders complete without charging, effectively giving away products. Set HYPERSWITCH_ENABLED=true and configure HYPERSWITCH_API_KEY / HYPERSWITCH_WEBHOOK_SECRET")
|
||||
}
|
||||
|
||||
// 10. REDIS_URL must be *explicitly* set in production. The struct default
|
||||
// (redis://<appDomain>:6379) lets a misconfigured pod start up with
|
||||
// in-memory fallbacks — and in multi-pod deployments that silently
|
||||
// breaks cross-instance PubSub (chat, session revocation, etc.).
|
||||
if strings.TrimSpace(os.Getenv("REDIS_URL")) == "" {
|
||||
return fmt.Errorf("REDIS_URL must be explicitly set in production. A missing value lets the app boot against the default host and silently degrade to in-memory fallbacks that break cross-pod features")
|
||||
}
|
||||
|
||||
case EnvTest:
|
||||
// TEST: Validation adaptée aux tests
|
||||
// CORS peut être vide ou configuré explicitement
|
||||
|
|
|
|||
|
|
@ -504,6 +504,58 @@ func TestValidateForEnvironment_HyperswitchRequiredInProduction(t *testing.T) {
|
|||
})
|
||||
}
|
||||
|
||||
// TestValidateForEnvironment_RedisURLRequiredInProduction verifies that an
|
||||
// unset REDIS_URL fails in production (v1.0.4 multi-pod fallback fix).
|
||||
func TestValidateForEnvironment_RedisURLRequiredInProduction(t *testing.T) {
|
||||
origClamAV := os.Getenv("CLAMAV_REQUIRED")
|
||||
origRedis := os.Getenv("REDIS_URL")
|
||||
defer func() {
|
||||
if origClamAV != "" {
|
||||
os.Setenv("CLAMAV_REQUIRED", origClamAV)
|
||||
} else {
|
||||
os.Unsetenv("CLAMAV_REQUIRED")
|
||||
}
|
||||
if origRedis != "" {
|
||||
os.Setenv("REDIS_URL", origRedis)
|
||||
} else {
|
||||
os.Unsetenv("REDIS_URL")
|
||||
}
|
||||
}()
|
||||
os.Setenv("CLAMAV_REQUIRED", "true")
|
||||
|
||||
cfg := &Config{
|
||||
Env: EnvProduction,
|
||||
AppPort: 8080,
|
||||
JWTSecret: strings.Repeat("a", 32),
|
||||
ChatJWTSecret: strings.Repeat("b", 32),
|
||||
OAuthEncryptionKey: strings.Repeat("c", 32),
|
||||
JWTIssuer: "veza-api",
|
||||
JWTAudience: "veza-platform",
|
||||
DatabaseURL: "postgresql://user:pass@localhost:5432/db",
|
||||
RedisURL: "redis://localhost:6379", // struct field is valid (default)
|
||||
RateLimitLimit: 100,
|
||||
RateLimitWindow: 60,
|
||||
CORSOrigins: []string{"https://example.com"},
|
||||
LogLevel: "INFO",
|
||||
HyperswitchEnabled: true,
|
||||
}
|
||||
logger, _ := zap.NewDevelopment()
|
||||
cfg.Logger = logger
|
||||
|
||||
t.Run("production with unset REDIS_URL env fails", func(t *testing.T) {
|
||||
os.Unsetenv("REDIS_URL")
|
||||
err := cfg.ValidateForEnvironment()
|
||||
require.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "REDIS_URL must be explicitly set in production")
|
||||
})
|
||||
|
||||
t.Run("production with REDIS_URL env succeeds", func(t *testing.T) {
|
||||
os.Setenv("REDIS_URL", "redis://:password@prod-redis:6379")
|
||||
err := cfg.ValidateForEnvironment()
|
||||
require.NoError(t, err)
|
||||
})
|
||||
}
|
||||
|
||||
// TestValidateForEnvironment_ChatJWTSecretInProduction verifies CHAT_JWT_SECRET must differ from JWT_SECRET in production (v0.902)
|
||||
func TestValidateForEnvironment_ChatJWTSecretInProduction(t *testing.T) {
|
||||
secret := strings.Repeat("a", 32)
|
||||
|
|
|
|||
|
|
@ -20,6 +20,12 @@ func NewChatPubSubService(redisClient *redis.Client, logger *zap.Logger) *ChatPu
|
|||
if logger == nil {
|
||||
logger = zap.NewNop()
|
||||
}
|
||||
if redisClient == nil {
|
||||
// In multi-pod deployments the in-memory fallback silently breaks:
|
||||
// messages published on pod A are never seen by subscribers on pod B.
|
||||
// Emit a loud startup error so the misconfiguration is noticed.
|
||||
logger.Error("Redis unavailable, falling back to in-memory PubSub — cross-instance messages will be lost. Set REDIS_URL and restart for multi-pod correctness")
|
||||
}
|
||||
return &ChatPubSubService{
|
||||
redisClient: redisClient,
|
||||
logger: logger,
|
||||
|
|
@ -36,7 +42,13 @@ func (s *ChatPubSubService) Publish(ctx context.Context, roomID uuid.UUID, messa
|
|||
|
||||
if s.redisClient != nil {
|
||||
if err := s.redisClient.Publish(ctx, channel, message).Err(); err != nil {
|
||||
s.logger.Warn("Redis publish failed, using in-memory fallback", zap.Error(err))
|
||||
// ERROR, not Warn: the in-memory fallback only reaches subscribers
|
||||
// on this pod — a multi-pod chat becomes partitioned until Redis
|
||||
// recovers. Operators should page on this log line.
|
||||
s.logger.Error("Redis publish failed, in-memory fallback will not reach other pods",
|
||||
zap.String("channel", channel),
|
||||
zap.Error(err),
|
||||
)
|
||||
s.publishInMemory(channel, message)
|
||||
}
|
||||
return nil
|
||||
|
|
|
|||
45
veza-backend-api/internal/services/chat_pubsub_test.go
Normal file
45
veza-backend-api/internal/services/chat_pubsub_test.go
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
package services
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
|
||||
"github.com/google/uuid"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"go.uber.org/zap"
|
||||
"go.uber.org/zap/zapcore"
|
||||
"go.uber.org/zap/zaptest/observer"
|
||||
)
|
||||
|
||||
func TestChatPubSubService_NilRedisLogsError(t *testing.T) {
|
||||
core, observed := observer.New(zapcore.ErrorLevel)
|
||||
logger := zap.New(core)
|
||||
|
||||
_ = NewChatPubSubService(nil, logger)
|
||||
|
||||
entries := observed.All()
|
||||
assert.Len(t, entries, 1, "constructor should emit exactly one ERROR log when Redis is nil")
|
||||
assert.Equal(t, zapcore.ErrorLevel, entries[0].Level)
|
||||
assert.Contains(t, entries[0].Message, "cross-instance messages will be lost")
|
||||
}
|
||||
|
||||
func TestChatPubSubService_InMemoryFanout(t *testing.T) {
|
||||
svc := NewChatPubSubService(nil, zap.NewNop())
|
||||
|
||||
ctx := context.Background()
|
||||
roomID := uuid.New()
|
||||
|
||||
ch, cancel, err := svc.Subscribe(ctx, roomID)
|
||||
assert.NoError(t, err)
|
||||
defer cancel()
|
||||
|
||||
err = svc.Publish(ctx, roomID, []byte("hello"))
|
||||
assert.NoError(t, err)
|
||||
|
||||
select {
|
||||
case msg := <-ch:
|
||||
assert.Equal(t, "hello", string(msg))
|
||||
default:
|
||||
t.Fatal("expected message on in-memory channel")
|
||||
}
|
||||
}
|
||||
Loading…
Reference in a new issue