veza/veza-backend-api/cmd/api/main.go

package main
import (
"context"
"fmt"
"log"
"net/http"
// SECURITY(REM-027): pprof removed from production — use build tag or dedicated debug binary instead.
// To enable: go build -tags debug ./cmd/api
"os"
"os/signal"
"syscall"
"time"
"github.com/getsentry/sentry-go"
"github.com/gin-gonic/gin"
"github.com/joho/godotenv"
"go.uber.org/zap"
"veza-backend-api/internal/api"
"veza-backend-api/internal/config"
"veza-backend-api/internal/core/marketplace"
vezaes "veza-backend-api/internal/elasticsearch"
"veza-backend-api/internal/jobs"
"veza-backend-api/internal/metrics"
"veza-backend-api/internal/monitoring"
"veza-backend-api/internal/services"
"veza-backend-api/internal/services/hyperswitch"
"veza-backend-api/internal/shutdown"
"veza-backend-api/internal/tracing"
"veza-backend-api/internal/workers"
2025-12-03 19:29:37 +00:00
_ "veza-backend-api/docs" // Import docs for swagger
)
// @title Veza Backend API
// @version 1.2.0
// @description Backend API for the Veza platform.
// @termsOfService http://swagger.io/terms/
// @contact.name API Support
// @contact.url https://veza.fr/support
// @contact.email support@veza.fr
// @license.name Apache 2.0
// @license.url http://www.apache.org/licenses/LICENSE-2.0.html
// @host localhost:18080
2025-12-03 19:29:37 +00:00
// @BasePath /api/v1
// @securityDefinitions.apikey BearerAuth
// @in header
// @name Authorization
// @securityDefinitions.apikey ApiKeyAuth
// @in header
// @name X-API-Key
// @description Developer API key (obtain from Developer Portal). Format: vza_xxxxx
// appVersion is overridden at build time via
// `-ldflags "-X main.appVersion=vX.Y.Z"`. Used as the OTel resource
// attribute service.version + Sentry release tag.
var appVersion = "dev"
func main() {
// Load environment variables from .env (if present)
// NOTE: Do not write to stderr to avoid broken pipe errors with systemd journald
// The message will be logged by the logger once it's initialized
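// The returned error is deliberately ignored: a missing .env is the expected case when
// configuration comes from the process environment (systemd unit, container runtime).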
_ = godotenv.Load()
// FIX #1: Removed the duplicated logger initialization here.
// The logger is initialized in config.NewConfig() with the configured LOG_LEVEL.
// Load the configuration (which initializes the logger)
cfg, err := config.NewConfig()
if err != nil {
// CRITICAL: Do not write to stderr or files to avoid broken pipe errors
// Just exit silently - systemd will capture the exit code
// The error details will be in the application logs if the logger was initialized
os.Exit(1)
}
// Use the logger initialized by the config
logger := cfg.Logger
if logger == nil {
log.Fatal("❌ Logger not initialized in the configuration")
}
logger.Info("🚀 Starting Veza Backend API")
// Validate the configuration
if err := cfg.Validate(); err != nil {
logger.Fatal("❌ Invalid configuration", zap.Error(err))
}
// Initialize Sentry if a DSN is configured
if cfg.SentryDsn != "" {
err := sentry.Init(sentry.ClientOptions{
Dsn: cfg.SentryDsn,
Environment: cfg.SentryEnvironment,
TracesSampleRate: cfg.SentrySampleRateTransactions,
SampleRate: cfg.SentrySampleRateErrors,
// AttachStacktrace to capture stack traces
AttachStacktrace: true,
})
if err != nil {
logger.Warn("❌ Impossible d'initialiser Sentry", zap.Error(err))
} else {
logger.Info("✅ Sentry initialisé", zap.String("environment", cfg.SentryEnvironment))
}
// Flush pending Sentry events before shutdown
defer sentry.Flush(2 * time.Second)
} else {
logger.Info(" Sentry non configuré (SENTRY_DSN non défini)")
}
// v1.0.9 Day 9 — OpenTelemetry tracer init. Spans flow to the
// otel-collector container (provisioned by infra/ansible/roles/
// otel_collector) which forwards them to Tempo. Disabled in
// dev / unit tests via OTEL_SDK_DISABLED=true to keep the
// process from background-dialing localhost:4317.
tracerCtx, tracerCancel := context.WithTimeout(context.Background(), 10*time.Second)
// AppVersion drawn from build-time ldflag; falls back to "dev" so
// the resource attribute is always populated. Set via:
// go build -ldflags "-X main.appVersion=v1.0.9" ./cmd/api
tracerProvider, err := tracing.InitOTLPTracer(tracerCtx, cfg.Env, appVersion, logger)
tracerCancel()
if err != nil {
// Tracing failure is operational, not fatal. The collector
// could be starting up at the same time as the backend; the
// exporter retries internally.
logger.Warn("OTel tracer init failed — continuing without spans", zap.Error(err))
}
defer func() {
// Guard: the provider may be nil if initialization failed above.
if tracerProvider == nil {
return
}
shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
_ = tracerProvider.Shutdown(shutdownCtx)
}()
// Initialize the database
db := cfg.Database
if db == nil {
logger.Fatal("❌ Database not initialized")
}
defer db.Close()
if err := db.Initialize(); err != nil {
logger.Fatal("❌ Failed to initialize the database", zap.Error(err))
}
// MOD-P2-004: Start the DB pool metrics collector
// Collects DB pool stats every 10 seconds and exposes them via Prometheus
metrics.StartDBPoolStatsCollector(db.DB, 10*time.Second)
logger.Info("✅ DB pool metrics collector started")
// Fail fast: check RabbitMQ if enabled
if cfg.RabbitMQEnable {
if cfg.RabbitMQEventBus == nil {
logger.Fatal("❌ RabbitMQ activé (RABBITMQ_ENABLE=true) mais non initialisé (problème de connexion?)")
} else {
// Optionnel: Check connection status if RabbitMQEventBus exposes it
// For now, assume if initialized it's connected or retrying.
// If we want STRICT fail fast, we would need to verify connection is Open here.
logger.Info("✅ RabbitMQ actif")
}
} else {
logger.Info(" RabbitMQ désactivé")
}
// BE-SVC-017: Create the graceful shutdown manager
shutdownManager := shutdown.NewShutdownManager(logger)
// Start the Job Worker with a context for graceful shutdown
var workerCtx context.Context
var workerCancel context.CancelFunc
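// workerCtx / workerCancel are declared at function scope; the shutdown hook registered
// below captures workerCancel to stop the worker when the process exits.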
if cfg.JobWorker != nil {
workerCtx, workerCancel = context.WithCancel(context.Background())
cfg.JobWorker.Start(workerCtx)
logger.Info("✅ Job Worker démarré")
// Enregistrer le Job Worker pour shutdown gracieux
shutdownManager.Register(shutdown.NewShutdownFunc("job_worker", func(ctx context.Context) error {
if workerCancel != nil {
workerCancel()
// Wait briefly for in-flight jobs to finish
time.Sleep(2 * time.Second)
}
return nil
}))
} else {
logger.Warn("⚠️ Job Worker non initialisé")
}
// v0.701: Start Transfer Retry Worker
// v1.0.7 item B: Start Reversal Worker (shares the same
// StripeConnectService — one initialisation for both workers).
if cfg.StripeConnectEnabled && cfg.StripeConnectSecretKey != "" {
stripeConnectSvc := services.NewStripeConnectService(db.GormDB, cfg.StripeConnectSecretKey, logger)
if cfg.TransferRetryEnabled {
retryWorker := marketplace.NewTransferRetryWorker(
db.GormDB, stripeConnectSvc, logger, cfg.TransferRetryInterval, cfg.TransferRetryMaxAttempts,
)
retryCtx, retryCancel := context.WithCancel(context.Background())
go retryWorker.Start(retryCtx)
logger.Info("Transfer Retry Worker started",
zap.Duration("interval", cfg.TransferRetryInterval),
zap.Int("max_retries", cfg.TransferRetryMaxAttempts))
shutdownManager.Register(shutdown.NewShutdownFunc("transfer_retry_worker", func(ctx context.Context) error {
retryCancel()
return nil
}))
}
if cfg.ReversalWorkerEnabled {
reversalWorker := marketplace.NewStripeReversalWorker(
db.GormDB, stripeConnectSvc, logger,
cfg.ReversalCheckInterval, cfg.ReversalMaxRetries,
cfg.ReversalBackoffBase, cfg.ReversalBackoffMax,
)
reversalCtx, reversalCancel := context.WithCancel(context.Background())
go reversalWorker.Start(reversalCtx)
logger.Info("Stripe Reversal Worker started",
zap.Duration("interval", cfg.ReversalCheckInterval),
zap.Int("max_retries", cfg.ReversalMaxRetries),
zap.Duration("backoff_base", cfg.ReversalBackoffBase),
zap.Duration("backoff_max", cfg.ReversalBackoffMax))
shutdownManager.Register(shutdown.NewShutdownFunc("stripe_reversal_worker", func(ctx context.Context) error {
reversalCancel()
return nil
}))
}
} else if cfg.TransferRetryEnabled || cfg.ReversalWorkerEnabled {
logger.Info("Transfer Retry / Reversal workers skipped — Stripe Connect not enabled")
}
// v1.0.7 item C: Reconciliation worker for stuck pending orders /
// refunds whose webhook never arrived. Gated on Hyperswitch being
// configured — without PSP read access there's nothing to sync
// against.
if cfg.ReconcileWorkerEnabled && cfg.HyperswitchEnabled && cfg.HyperswitchAPIKey != "" && cfg.HyperswitchURL != "" {
hsClient := hyperswitch.NewClient(cfg.HyperswitchURL, cfg.HyperswitchAPIKey)
hsProvider := hyperswitch.NewProvider(hsClient)
// Build a marketplace.Service for the dispatcher side. Scoped
// to the worker — the HTTP handler constructs its own via
// APIRouter.getMarketplaceService which wires additional opts
// (storage, checkout URL). Reconciler only needs the two
// Process*Webhook methods.
mktSvc := marketplace.NewService(db.GormDB, logger, nil,
marketplace.WithPaymentProvider(hsProvider),
marketplace.WithHyperswitchConfig(true, cfg.CheckoutSuccessURL),
)
reconcileWorker := marketplace.NewReconcileHyperswitchWorker(
db.GormDB, hsProvider, mktSvc, logger,
cfg.ReconcileInterval,
cfg.ReconcileOrderStuckAfter,
cfg.ReconcileRefundStuckAfter,
cfg.ReconcileRefundOrphanAfter,
)
reconcileCtx, reconcileCancel := context.WithCancel(context.Background())
go reconcileWorker.Start(reconcileCtx)
logger.Info("Reconcile Hyperswitch Worker started",
zap.Duration("interval", cfg.ReconcileInterval),
zap.Duration("order_stuck_after", cfg.ReconcileOrderStuckAfter),
zap.Duration("refund_stuck_after", cfg.ReconcileRefundStuckAfter),
zap.Duration("refund_orphan_after", cfg.ReconcileRefundOrphanAfter))
shutdownManager.Register(shutdown.NewShutdownFunc("reconcile_hyperswitch_worker", func(ctx context.Context) error {
reconcileCancel()
return nil
}))
} else if cfg.ReconcileWorkerEnabled {
logger.Info("Reconcile worker skipped — Hyperswitch not enabled")
}
// v0.802: Start Cloud Backup Worker (copies cloud files to backup prefix every 24h)
if cfg.S3StorageService != nil {
backupWorker := services.NewCloudBackupWorker(db.GormDB, cfg.S3StorageService, logger)
backupCtx, backupCancel := context.WithCancel(context.Background())
go backupWorker.Start(backupCtx)
logger.Info("Cloud Backup Worker started (24h interval)")
shutdownManager.Register(shutdown.NewShutdownFunc("cloud_backup_worker", func(ctx context.Context) error {
backupCancel()
return nil
}))
}
// v0.802: Start Gear Warranty Notifier (sends notifications when warranty expires in 30 days)
notificationService := services.NewNotificationService(db, logger)
warrantyNotifier := services.NewGearWarrantyNotifier(db.GormDB, notificationService, logger)
warrantyCtx, warrantyCancel := context.WithCancel(context.Background())
go warrantyNotifier.Start(warrantyCtx)
logger.Info("Gear Warranty Notifier started (24h interval)")
shutdownManager.Register(shutdown.NewShutdownFunc("gear_warranty_notifier", func(ctx context.Context) error {
warrantyCancel()
return nil
}))
// v0.10.5 F552: Weekly notification digest (runs on Sunday)
if cfg.JobWorker != nil {
digestWorker := services.NewNotificationDigestWorker(db.GormDB, cfg.JobWorker, logger)
digestCtx, digestCancel := context.WithCancel(context.Background())
go digestWorker.Start(digestCtx)
logger.Info("Notification digest worker started (weekly on Sunday)")
shutdownManager.Register(shutdown.NewShutdownFunc("notification_digest_worker", func(ctx context.Context) error {
digestCancel()
return nil
}))
}
// v0.10.8 F065: Hard delete worker (GDPR - final anonymization after 30 days)
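// Default-on: the worker runs unless HARD_DELETE_CRON_ENABLED is explicitly set to "false".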
if os.Getenv("HARD_DELETE_CRON_ENABLED") != "false" {
// Optional ES client for the worker's RGPD cleanup (users/tracks/playlists indices).
// Non-fatal if ES is disabled or unreachable — the worker will just skip ES cleanup.
var hardDeleteESClient *vezaes.Client
if esCfg := vezaes.LoadConfig(); esCfg.Enabled {
if esc, esErr := vezaes.NewClient(esCfg, logger); esErr != nil {
logger.Warn("Elasticsearch unavailable for hard delete worker, ES cleanup disabled",
zap.Error(esErr))
} else {
hardDeleteESClient = esc
}
}
hardDeleteWorker := workers.NewHardDeleteWorker(db.GormDB, logger, 24*time.Hour).
WithRedis(cfg.RedisClient).
WithElasticsearch(hardDeleteESClient)
hardDeleteCtx, hardDeleteCancel := context.WithCancel(context.Background())
go hardDeleteWorker.Start(hardDeleteCtx)
logger.Info("Hard delete worker started (24h interval)",
zap.Bool("redis_cleanup", cfg.RedisClient != nil),
zap.Bool("es_cleanup", hardDeleteESClient != nil),
)
shutdownManager.Register(shutdown.NewShutdownFunc("hard_delete_worker", func(ctx context.Context) error {
hardDeleteWorker.Stop()
hardDeleteCancel()
return nil
}))
} else {
logger.Info("Hard delete worker disabled (HARD_DELETE_CRON_ENABLED=false)")
}
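// Illustrative sketch (not part of this file): the Start/Stop contract used
// above is assumed to be a ticker loop that exits once its context is
// cancelled. The real implementation lives in internal/workers; names such as
// w.interval and w.runOnce are hypothetical.
//
//	func (w *HardDeleteWorker) Start(ctx context.Context) {
//		ticker := time.NewTicker(w.interval) // 24h in production
//		defer ticker.Stop()
//		for {
//			select {
//			case <-ctx.Done():
//				return
//			case <-ticker.C:
//				w.runOnce(ctx) // anonymize users past their recovery deadline
//			}
//		}
//	}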
// Gin mode configuration
// Note: APP_ENV is read directly from the environment because it is not exposed in Config
appEnv := os.Getenv("APP_ENV")
if appEnv == "production" {
gin.SetMode(gin.ReleaseMode)
} else {
gin.SetMode(gin.DebugMode)
}
// Create the Gin router
router := gin.New()
// SECURITY(HIGH-006): Restrict trusted proxies to prevent IP spoofing via X-Forwarded-For.
// Default: trust nothing (c.ClientIP() returns RemoteAddr only).
// Set TRUSTED_PROXIES="10.0.0.1,10.0.0.2" if behind a known reverse proxy/load balancer.
if err := router.SetTrustedProxies(nil); err != nil {
	logger.Warn("Failed to configure trusted proxies", zap.Error(err))
}
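// Illustrative sketch only: honoring the TRUSTED_PROXIES variable mentioned
// above would look roughly like this (hypothetical, not wired in here — as
// written, the server trusts no proxy; it would also need the strings import):
//
//	if proxies := os.Getenv("TRUSTED_PROXIES"); proxies != "" {
//		if err := router.SetTrustedProxies(strings.Split(proxies, ",")); err != nil {
//			logger.Warn("Invalid TRUSTED_PROXIES value, trusting no proxies", zap.Error(err))
//		}
//	}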
// Global middleware (Logger, Recovery) recommended by ORIGIN
router.Use(gin.Logger(), gin.Recovery())
// Route configuration
apiRouter := api.NewAPIRouter(db, cfg) // Instantiate APIRouter
if err := apiRouter.Setup(router); err != nil {
logger.Error("Failed to setup API routes", zap.Error(err))
os.Exit(1)
}
// v1.0.4: Hourly cleanup of tracks stuck in `processing` whose upload file
// vanished (crash, SIGKILL, disk wipe). Keeps the tracks table honest.
jobs.ScheduleOrphanTracksCleanup(db, logger)
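// Illustrative sketch of the pass described above (hypothetical helper names —
// the real logic lives in internal/jobs): for each track still marked
// `processing`, check whether its upload file still exists and retire the row
// if it does not.
//
//	for _, t := range stuckProcessingTracks(db) {
//		if _, err := os.Stat(t.UploadPath); os.IsNotExist(err) {
//			markTrackFailed(db, t.ID) // the upload vanished; the row is stale
//		}
//	}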
// v1.0.7 item E: daily sweep of hyperswitch_webhook_log rows older than
// HYPERSWITCH_WEBHOOK_LOG_RETENTION_DAYS (default 90). Batched so a large
// backlog doesn't lock the table.
jobs.ScheduleHyperswitchWebhookLogCleanup(db, logger, cfg.HyperswitchWebhookLogRetentionDays)
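// Illustrative sketch of the batched sweep described above (hypothetical names
// and batch size — the real query lives in internal/jobs): delete in fixed-size
// chunks and pause between chunks so a large backlog never holds a long lock.
//
//	cutoff := time.Now().AddDate(0, 0, -retentionDays)
//	for {
//		res := gormDB.Exec(
//			"DELETE FROM hyperswitch_webhook_log WHERE id IN (SELECT id FROM hyperswitch_webhook_log WHERE created_at < ? LIMIT 10000)",
//			cutoff,
//		)
//		if res.Error != nil || res.RowsAffected == 0 {
//			break
//		}
//		time.Sleep(100 * time.Millisecond)
//	}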
// v1.0.7 item F: 60s sampler feeds five ledger-health gauges +
// reconciler_* counters. Grafana dashboard in config/grafana/ledger.json,
// alert rules in config/alertmanager/ledger.yml.
ledgerSamplerCtx, ledgerSamplerCancel := context.WithCancel(context.Background())
monitoring.ScheduleLedgerHealthSampler(ledgerSamplerCtx, db.GormDB, logger)
shutdownManager.Register(shutdown.NewShutdownFunc("ledger_health_sampler", func(ctx context.Context) error {
ledgerSamplerCancel()
return nil
}))
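// Illustrative sketch of a single sampler tick (hypothetical names — the real
// sampler lives in the monitoring package): run the health query, publish the
// count as a gauge, and fall back to a sentinel value when the query fails.
//
//	var orphanRefunds int64
//	if err := gormDB.Raw(orphanRefundQuery).Scan(&orphanRefunds).Error; err != nil {
//		ledgerOrphanRefundRows.Set(-1) // sentinel: the sampler itself is broken
//	} else {
//		ledgerOrphanRefundRows.Set(float64(orphanRefunds))
//	}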
// HTTP server configuration
port := fmt.Sprintf("%d", cfg.AppPort)
if cfg.AppPort == 0 {
port = "8080"
}
server := &http.Server{
Addr: fmt.Sprintf(":%s", port),
Handler: router,
ReadTimeout: 30 * time.Second, // ORIGIN standards
WriteTimeout: 30 * time.Second,
}
// BE-SVC-017: Register every service for graceful shutdown
// Register the HTTP server
shutdownManager.Register(shutdown.NewShutdownFunc("http_server", func(ctx context.Context) error {
return server.Shutdown(ctx)
}))
// Register the configuration (closes DB, Redis, RabbitMQ, etc.)
shutdownManager.Register(shutdown.NewShutdownFunc("config", func(ctx context.Context) error {
return cfg.Close()
}))
// Register the logger for a final flush
shutdownManager.Register(shutdown.NewShutdownFunc("logger", func(ctx context.Context) error {
if logger != nil {
return logger.Sync()
}
return nil
}))
// Register Sentry for a final flush
if cfg.SentryDsn != "" {
shutdownManager.Register(shutdown.NewShutdownFunc("sentry", func(ctx context.Context) error {
sentry.Flush(2 * time.Second)
return nil
}))
}
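// Illustrative sketch of what shutdownManager.Shutdown is assumed to do with
// the registrations above (hypothetical — the real logic lives in the shutdown
// package): run every hook against the shared deadline and collect errors
// instead of aborting on the first failure.
//
//	var errs []error
//	for _, hook := range m.hooks {
//		if err := hook.Shutdown(ctx); err != nil {
//			errs = append(errs, fmt.Errorf("%s: %w", hook.Name(), err))
//		}
//	}
//	return errors.Join(errs...)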
// Graceful shutdown handling
quit := make(chan os.Signal, 1)
signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM)
go func() {
logger.Info("🌐 Serveur HTTP démarré", zap.String("port", port))
if err := server.ListenAndServe(); err != nil && err != http.ErrServerClosed {
logger.Fatal("❌ Erreur du serveur HTTP", zap.Error(err))
}
}()
// Wait for the shutdown signal
<-quit
logger.Info("🔄 Signal d'arrêt reçu, démarrage du shutdown gracieux...")
// BE-SVC-017: Coordinated graceful shutdown of all services
shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 30*time.Second)
defer shutdownCancel()
if err := shutdownManager.Shutdown(shutdownCtx); err != nil {
logger.Error("❌ Erreur lors du shutdown gracieux", zap.Error(err))
} else {
logger.Info("✅ Shutdown gracieux terminé avec succès")
}
}