veza/veza-backend-api/internal/services/chat_pubsub.go
senke a36d9b2d59
Some checks failed
Veza CI / Backend (Go) (push) Failing after 8m56s
Veza CI / Frontend (Web) (push) Has been cancelled
E2E Playwright / e2e (full) (push) Has been cancelled
Veza CI / Notify on failure (push) Blocked by required conditions
Veza CI / Rust (Stream Server) (push) Successful in 5m3s
Security Scan / Secret Scanning (gitleaks) (push) Failing after 53s
feat(redis): Sentinel HA + cache hit rate metrics (W3 Day 11)
Three Incus containers, each running redis-server + redis-sentinel
(co-located). redis-1 = master at first boot, redis-2/3 = replicas.
Sentinel quorum=2 of 3; failover-timeout=30s satisfies the W3
acceptance criterion.

- internal/config/redis_init.go : initRedis branches on
  REDIS_SENTINEL_ADDRS ; non-empty -> redis.NewFailoverClient with
  MasterName + SentinelAddrs + SentinelPassword. Empty -> existing
  single-instance NewClient (dev/local stays parametric).
- internal/config/config.go : 3 new fields (RedisSentinelAddrs,
  RedisSentinelMasterName, RedisSentinelPassword) read from env.
  parseRedisSentinelAddrs trims+filters CSV.
- internal/metrics/cache_hit_rate.go : new RecordCacheHit / Miss
  counters, labelled by subsystem. Cardinality bounded.
- internal/middleware/rate_limiter.go : instrument 3 Eval call sites
  (DDoS, frontend log throttle, upload throttle). Hit = Redis answered,
  Miss = error -> in-memory fallback.
- internal/services/chat_pubsub.go : instrument Publish + PublishPresence.
- internal/websocket/chat/presence_service.go : instrument SetOnline /
  SetOffline / Heartbeat / GetPresence. redis.Nil counts as a hit
  (legitimate empty result).
- infra/ansible/roles/redis_sentinel/ : install Redis 7 + Sentinel,
  render redis.conf + sentinel.conf, systemd units. Vault assertion
  prevents shipping placeholder passwords to staging/prod.
- infra/ansible/playbooks/redis_sentinel.yml : provisions the 3
  containers + applies common baseline + role.
- infra/ansible/inventory/lab.yml : new groups redis_ha + redis_ha_master.
- infra/ansible/tests/test_redis_failover.sh : kills the master
  container, polls Sentinel for the new master, asserts elapsed < 30s.
- config/grafana/dashboards/redis-cache-overview.json : 3 hit-rate
  stats (rate_limiter / chat_pubsub / presence) + ops/s breakdown.
- docs/ENV_VARIABLES.md §3 : 3 new REDIS_SENTINEL_* env vars.
- veza-backend-api/.env.template : 3 placeholders (empty default).

Acceptance (Day 11) : Sentinel failover < 30s ; cache hit-rate
dashboard populated. Lab test pending Sentinel deployment.

W3 verification gate progress : Redis Sentinel ✓ (this commit),
MinIO EC4+2 → Day 12, CDN → Day 13, DMCA → Day 14, embed → Day 15.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-28 13:36:55 +02:00

184 lines
4.3 KiB
Go

package services
import (
"context"
"sync"
"veza-backend-api/internal/metrics"
"github.com/google/uuid"
"github.com/redis/go-redis/v9"
"go.uber.org/zap"
)
// ChatPubSubService fans chat messages and presence events out to
// subscribers. It prefers Redis Pub/Sub (cluster-wide delivery) and falls
// back to a process-local subscriber map when no Redis client is provided.
type ChatPubSubService struct {
redisClient *redis.Client // nil => in-memory fallback only (single-pod delivery)
logger *zap.Logger // never nil after NewChatPubSubService (defaults to zap.NewNop)
inMemorySubscribers map[string][]chan []byte // channel name -> local subscriber channels
mu sync.RWMutex // guards inMemorySubscribers
}
// NewChatPubSubService builds a pub/sub service backed by Redis when a
// client is supplied, or by the process-local subscriber map otherwise.
// A nil logger is replaced with a no-op logger, so the returned service
// can always log safely.
func NewChatPubSubService(redisClient *redis.Client, logger *zap.Logger) *ChatPubSubService {
	if logger == nil {
		logger = zap.NewNop()
	}
	svc := &ChatPubSubService{
		redisClient:         redisClient,
		logger:              logger,
		inMemorySubscribers: make(map[string][]chan []byte),
	}
	if redisClient == nil {
		// In multi-pod deployments the in-memory fallback silently breaks:
		// messages published on pod A are never seen by subscribers on pod B.
		// Emit a loud startup error so the misconfiguration is noticed.
		svc.logger.Error("Redis unavailable, falling back to in-memory PubSub — cross-instance messages will be lost. Set REDIS_URL and restart for multi-pod correctness")
	}
	return svc
}
// roomChannel derives the Redis channel name used for a given chat room.
func (s *ChatPubSubService) roomChannel(roomID uuid.UUID) string {
	const prefix = "chat:room:"
	return prefix + roomID.String()
}
// Publish broadcasts a chat message to the room's channel. With Redis the
// message is delivered cluster-wide; on Redis error, or when no Redis
// client is configured, delivery degrades to the process-local subscriber
// map. Cache hit/miss counters record which path actually ran.
func (s *ChatPubSubService) Publish(ctx context.Context, roomID uuid.UUID, message []byte) error {
	channel := s.roomChannel(roomID)

	// No Redis configured: local-only delivery, counted as a miss.
	if s.redisClient == nil {
		metrics.RecordCacheMiss("chat_pubsub")
		s.publishInMemory(channel, message)
		return nil
	}

	err := s.redisClient.Publish(ctx, channel, message).Err()
	if err == nil {
		metrics.RecordCacheHit("chat_pubsub")
		return nil
	}

	metrics.RecordCacheMiss("chat_pubsub")
	// ERROR, not Warn: the in-memory fallback only reaches subscribers
	// on this pod — a multi-pod chat becomes partitioned until Redis
	// recovers. Operators should page on this log line.
	s.logger.Error("Redis publish failed, in-memory fallback will not reach other pods",
		zap.String("channel", channel),
		zap.Error(err),
	)
	s.publishInMemory(channel, message)
	return nil
}
// Subscribe returns a buffered (256) channel of raw message payloads for
// the given room, plus a cancel func that tears the subscription down.
// The channel is closed when the Redis subscription ends (cancel called
// or ctx done). Messages are dropped when the buffer is full rather than
// blocking the receive loop.
func (s *ChatPubSubService) Subscribe(ctx context.Context, roomID uuid.UUID) (<-chan []byte, func(), error) {
	channel := s.roomChannel(roomID)
	if s.redisClient == nil {
		return s.subscribeInMemory(channel)
	}

	sub := s.redisClient.Subscribe(ctx, channel)
	out := make(chan []byte, 256)
	go func() {
		// Closing out signals consumers that the subscription ended.
		defer close(out)
		for {
			msg, err := sub.ReceiveMessage(ctx)
			if err != nil {
				// Covers ctx cancellation and sub.Close() from cancel.
				return
			}
			select {
			case out <- []byte(msg.Payload):
			default:
				s.logger.Warn("PubSub channel full, dropping message", zap.String("channel", channel))
			}
		}
	}()
	return out, func() { _ = sub.Close() }, nil
}
// PublishPresence broadcasts a presence event on the shared presence
// channel. With Redis the event reaches every pod; on failure the error
// is returned to the caller (unlike Publish, which swallows it after a
// local-only fallback) so callers can decide whether to retry. Without
// Redis the event is delivered to local subscribers only.
//
// Fix: the Redis-failure path previously emitted only a metric and no
// log, while Publish logs the equivalent condition at Error level —
// this made presence fan-out failures invisible in the logs.
func (s *ChatPubSubService) PublishPresence(ctx context.Context, event []byte) error {
	channel := "chat:presence"
	if s.redisClient != nil {
		if err := s.redisClient.Publish(ctx, channel, event).Err(); err != nil {
			metrics.RecordCacheMiss("chat_pubsub")
			// Consistency with Publish: surface the failure loudly —
			// cluster-wide presence fan-out only works via Redis.
			s.logger.Error("Redis presence publish failed",
				zap.String("channel", channel),
				zap.Error(err),
			)
			return err
		}
		metrics.RecordCacheHit("chat_pubsub")
		return nil
	}
	// No Redis configured: local-only delivery, counted as a miss.
	metrics.RecordCacheMiss("chat_pubsub")
	s.publishInMemory(channel, event)
	return nil
}
// SubscribePresence returns a buffered (256) channel of raw presence
// events plus a cancel func that tears the subscription down. The channel
// is closed when the Redis subscription ends (cancel called or ctx done).
//
// Fix: events were previously dropped *silently* when the buffer was
// full, whereas Subscribe logs a Warn for the same condition — silent
// drops are undiagnosable in production, so the same log is added here.
func (s *ChatPubSubService) SubscribePresence(ctx context.Context) (<-chan []byte, func(), error) {
	channel := "chat:presence"
	if s.redisClient != nil {
		pubsub := s.redisClient.Subscribe(ctx, channel)
		ch := make(chan []byte, 256)
		go func() {
			// Closing ch signals consumers that the subscription ended.
			defer close(ch)
			for {
				msg, err := pubsub.ReceiveMessage(ctx)
				if err != nil {
					// Covers ctx cancellation and pubsub.Close() from cancel.
					return
				}
				select {
				case ch <- []byte(msg.Payload):
				default:
					// Consistency with Subscribe: make backpressure visible.
					s.logger.Warn("PubSub channel full, dropping message", zap.String("channel", channel))
				}
			}
		}()
		cancel := func() {
			_ = pubsub.Close()
		}
		return ch, cancel, nil
	}
	return s.subscribeInMemory(channel)
}
// publishInMemory fans message out to every local subscriber registered
// for channel. Sends are non-blocking: a subscriber whose buffer is full
// simply misses this message.
func (s *ChatPubSubService) publishInMemory(channel string, message []byte) {
	s.mu.RLock()
	defer s.mu.RUnlock()
	// Ranging over a missing key yields a nil slice and zero iterations,
	// so no explicit presence check is needed.
	for _, subscriber := range s.inMemorySubscribers[channel] {
		select {
		case subscriber <- message:
		default:
			// Slow consumer: drop rather than block the publisher.
		}
	}
}
// subscribeInMemory registers a buffered (256) subscriber channel for
// channel and returns it together with a cancel func that deregisters
// and closes it.
//
// Fix: cancel is now idempotent. The original closed ch unconditionally,
// so a second cancel call (easy to trigger via defer + explicit call)
// panicked on double close. A local sync.Once makes repeats no-ops.
func (s *ChatPubSubService) subscribeInMemory(channel string) (<-chan []byte, func(), error) {
	ch := make(chan []byte, 256)
	s.mu.Lock()
	s.inMemorySubscribers[channel] = append(s.inMemorySubscribers[channel], ch)
	s.mu.Unlock()
	var once sync.Once
	cancel := func() {
		once.Do(func() {
			s.mu.Lock()
			defer s.mu.Unlock()
			subs := s.inMemorySubscribers[channel]
			for i, sub := range subs {
				if sub == ch {
					s.inMemorySubscribers[channel] = append(subs[:i], subs[i+1:]...)
					break
				}
			}
			// Closing under the write lock is safe: publishInMemory only
			// sends while holding the read lock, so no send can race
			// with this close.
			close(ch)
		})
	}
	return ch, cancel, nil
}