package monitoring

import (
	"context"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"go.uber.org/zap"
	"gorm.io/gorm"
)

// Ledger-health metrics (v1.0.7 item F, audit P1.8).
//
// Five gauges expose stuck-state counts so ops dashboards and alert
// rules can spot a money-movement pipeline stall before a customer
// does. Sampled every LedgerSamplerInterval (60s) by a ticker that
// runs one cheap indexed SELECT COUNT(*) per gauge. The query cost is
// bounded: each clause filters on `status` + `created_at`, both
// indexed on the relevant tables.
//
// Paired with the reconciler_* counters so the dashboard can tell the
// whole story at a glance: "we have N stuck orders and the
// reconciler has resolved M of them today."
//
// Plus two alert rules in config/alertmanager/ledger.yml:
//   - veza_ledger_stuck_orders_pending > 0 for 10m → page
//   - veza_ledger_orphan_refund_rows > 0 for 5m → page (bug in the
//     two-phase commit between DB and PSP; immediate ops attention)
var (
	// LedgerStuckOrdersPending is the count of orders sitting in
	// `pending` past the staleness threshold (30m by default). Should
	// be 0 in steady state; non-zero means webhooks from the PSP
	// stopped arriving or our endpoint is rejecting them.
	LedgerStuckOrdersPending = promauto.NewGauge(prometheus.GaugeOpts{
		Name: "veza_ledger_stuck_orders_pending",
		Help: "Orders in 'pending' status older than the staleness threshold (30m). Non-zero triggers ops alert.",
	})

	// LedgerStuckRefundsPending is the count of refunds with a PSP id
	// but still in `pending` past the threshold. Symptom: the refund
	// was accepted by Hyperswitch but our webhook handler never
	// received the terminal event.
	LedgerStuckRefundsPending = promauto.NewGauge(prometheus.GaugeOpts{
		Name: "veza_ledger_stuck_refunds_pending",
		Help: "Refunds with hyperswitch_refund_id set but status still 'pending' older than 30m.",
	})

	// LedgerFailedTransfersAtMaxRetry is the count of seller_transfers
	// that exhausted the retry worker's attempts. Non-zero = ops
	// investigation required; the Stripe Connect side is stuck
	// somehow.
	LedgerFailedTransfersAtMaxRetry = promauto.NewGauge(prometheus.GaugeOpts{
		Name: "veza_ledger_failed_transfers_at_max_retry",
		Help: "seller_transfers with status='permanently_failed'; the retry worker gave up.",
	})

	// LedgerOrphanRefundRows is THE alert gauge. Non-zero means a
	// Refund row exists in 'pending' with no hyperswitch_refund_id
	// (past 5m), i.e. Phase 1 ran but Phase 2 crashed. This is a
	// two-phase-commit bug. Page on > 0 for 5m.
	LedgerOrphanRefundRows = promauto.NewGauge(prometheus.GaugeOpts{
		Name: "veza_ledger_orphan_refund_rows",
		Help: "Refunds pending with empty hyperswitch_refund_id older than 5m. Non-zero = crash between Phase 1 and Phase 2 of RefundOrder, page ops immediately.",
	})

	// LedgerReversalPendingTransfers tracks rows waiting for the
	// Stripe reversal worker (v1.0.7 item B). Non-zero during a
	// reversal is normal and transient; sustained > 0 means the
	// reversal worker is stuck or Stripe Connect is down.
	LedgerReversalPendingTransfers = promauto.NewGauge(prometheus.GaugeOpts{
		Name: "veza_ledger_reversal_pending_transfers",
		Help: "seller_transfers in 'reversal_pending' older than 30m; the reversal worker is behind.",
	})

	// --- Reconciler metrics (v1.0.7 item F, covers item C) ---

	// ReconcilerActionsTotal counts actions the reconciliation worker
	// has taken, labelled by phase. Lets dashboards show "reconciler
	// fixed N stuck orders this week" vs "orphan refunds auto-failed
	// this week" without re-parsing logs.
	ReconcilerActionsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
		Name: "veza_reconciler_actions_total",
		Help: "Reconciler actions taken, by phase (stuck_orders|stuck_refunds|orphan_refunds).",
	}, []string{"phase"})

	// ReconcilerOrphanRefundsTotal is the load-bearing counter for
	// detecting two-phase-commit bugs. Each orphan refund the
	// reconciler auto-fails increments this; a sustained non-zero
	// rate is the ledger-health equivalent of a fire alarm.
	ReconcilerOrphanRefundsTotal = promauto.NewCounter(prometheus.CounterOpts{
		Name: "veza_reconciler_orphan_refunds_total",
		Help: "Orphan refunds (Phase 2 crash) auto-failed by the reconciler. Non-zero rate = investigate root cause.",
	})

	// ReconcilerSweepDurationSeconds measures one RunOnce tick so
	// slow sweeps show up in alerting.
	ReconcilerSweepDurationSeconds = promauto.NewHistogram(prometheus.HistogramOpts{
		Name:    "veza_reconciler_sweep_duration_seconds",
		Help:    "Duration of one ReconcileHyperswitchWorker.RunOnce tick in seconds.",
		Buckets: prometheus.ExponentialBuckets(0.1, 2, 10), // 0.1s to ~51.2s
	})

	// ReconcilerLastRunTimestamp is set to time.Now().Unix() at the
	// end of every RunOnce. Alert rule:
	// `time() - veza_reconciler_last_run_timestamp > 2 * RECONCILE_INTERVAL`
	// → worker is dead.
	ReconcilerLastRunTimestamp = promauto.NewGauge(prometheus.GaugeOpts{
		Name: "veza_reconciler_last_run_timestamp",
		Help: "Unix timestamp of the last successful ReconcileHyperswitchWorker tick. Stale = worker dead.",
	})
)

// Convenience recorders for the reconciler worker. Called from
// internal/core/marketplace/reconcile_hyperswitch.go so that package
// doesn't import Prometheus directly (keeps marketplace clean of
// observability plumbing).

func RecordReconcilerAction(phase string) {
	ReconcilerActionsTotal.WithLabelValues(phase).Inc()
}

func RecordReconcilerOrphanRefund() {
	ReconcilerOrphanRefundsTotal.Inc()
}

func RecordReconcilerSweepDuration(d time.Duration) {
	ReconcilerSweepDurationSeconds.Observe(d.Seconds())
	ReconcilerLastRunTimestamp.Set(float64(time.Now().Unix()))
}
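// Illustrative call pattern from the reconciler's RunOnce (a sketch:
// the worker internals and the stuckOrders/orphanRefunds names below
// are assumptions; only the Record* calls are this package's API):
//
//	start := time.Now()
//	for range stuckOrders { // hypothetical result set
//		monitoring.RecordReconcilerAction("stuck_orders")
//	}
//	for range orphanRefunds { // hypothetical result set
//		monitoring.RecordReconcilerAction("orphan_refunds")
//		monitoring.RecordReconcilerOrphanRefund()
//	}
//	monitoring.RecordReconcilerSweepDuration(time.Since(start))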
// --- Sampler ---

// LedgerSamplerInterval is how often the sampler re-queries the DB.
// 60s is the sweet spot for our volumes: the scrape cost is negligible,
// and stale-by-up-to-a-minute is fine for the alert-rule windows
// (10m / 5m).
const LedgerSamplerInterval = 60 * time.Second

// Staleness thresholds match the reconciler's defaults. If ops tunes
// the reconciler thresholds via env vars, the sampler still reports
// against these constants. That is intentional: the two serve different
// audiences (reconciler = auto-recovery, sampler = human visibility).
// A mismatch means alerts fire while the reconciler has already
// started working on the issue, which is the correct behavior.
const (
	ledgerStuckOrderAgeThreshold   = 30 * time.Minute
	ledgerStuckRefundAgeThreshold  = 30 * time.Minute
	ledgerOrphanRefundAgeThreshold = 5 * time.Minute
	ledgerReversalPendingThreshold = 30 * time.Minute
)

// SampleLedgerHealth runs the five count queries and updates the
// gauges. Safe to call concurrently (gauge writes are atomic). Any
// query error sets the corresponding gauge to -1, a distinctive
// value that dashboards can filter on ("sampler is broken, don't
// trust the number").
//
// Exposed as a function rather than a method so tests can drive it
// directly against a sqlite in-memory DB.
func SampleLedgerHealth(ctx context.Context, db *gorm.DB, logger *zap.Logger) {
	now := time.Now()

	sample := func(name string, gauge prometheus.Gauge, query string, args ...interface{}) {
		var count int64
		if err := db.WithContext(ctx).Raw(query, args...).Scan(&count).Error; err != nil {
			logger.Error("ledger sampler: query failed",
				zap.String("gauge", name), zap.Error(err))
			gauge.Set(-1)
			return
		}
		gauge.Set(float64(count))
	}

	sample("stuck_orders_pending", LedgerStuckOrdersPending,
		`SELECT COUNT(*) FROM orders
		 WHERE status = 'pending'
		   AND hyperswitch_payment_id IS NOT NULL
		   AND hyperswitch_payment_id <> ''
		   AND created_at < ?`,
		now.Add(-ledgerStuckOrderAgeThreshold))

	sample("stuck_refunds_pending", LedgerStuckRefundsPending,
		`SELECT COUNT(*) FROM refunds
		 WHERE status = 'pending'
		   AND hyperswitch_refund_id IS NOT NULL
		   AND hyperswitch_refund_id <> ''
		   AND created_at < ?`,
		now.Add(-ledgerStuckRefundAgeThreshold))

	sample("failed_transfers_at_max_retry", LedgerFailedTransfersAtMaxRetry,
		`SELECT COUNT(*) FROM seller_transfers WHERE status = 'permanently_failed'`)

	sample("orphan_refund_rows", LedgerOrphanRefundRows,
		`SELECT COUNT(*) FROM refunds
		 WHERE status = 'pending'
		   AND (hyperswitch_refund_id IS NULL OR hyperswitch_refund_id = '')
		   AND created_at < ?`,
		now.Add(-ledgerOrphanRefundAgeThreshold))

	sample("reversal_pending_transfers", LedgerReversalPendingTransfers,
		`SELECT COUNT(*) FROM seller_transfers
		 WHERE status = 'reversal_pending' AND updated_at < ?`,
		now.Add(-ledgerReversalPendingThreshold))
}
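// A minimal in-package test sketch for SampleLedgerHealth (illustrative;
// assumes a gorm sqlite driver and prometheus/client_golang's testutil
// package, and elides the surrounding TestXxx func plus schema/seed DDL):
//
//	db, _ := gorm.Open(sqlite.Open(":memory:"), &gorm.Config{})
//	// ... create orders/refunds/seller_transfers and seed one stale
//	// pending refund with an empty hyperswitch_refund_id ...
//	SampleLedgerHealth(context.Background(), db, zap.NewNop())
//	if got := testutil.ToFloat64(LedgerOrphanRefundRows); got != 1 {
//		t.Fatalf("orphan gauge = %v, want 1", got)
//	}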
// ScheduleLedgerHealthSampler runs SampleLedgerHealth once at startup
// (so dashboards aren't blank for the first minute) and then every
// LedgerSamplerInterval until ctx is cancelled.
func ScheduleLedgerHealthSampler(ctx context.Context, db *gorm.DB, logger *zap.Logger) {
	ticker := time.NewTicker(LedgerSamplerInterval)
	go func() {
		defer ticker.Stop()
		SampleLedgerHealth(ctx, db, logger)
		for {
			select {
			case <-ctx.Done():
				logger.Info("Ledger health sampler stopped")
				return
			case <-ticker.C:
				SampleLedgerHealth(ctx, db, logger)
			}
		}
	}()
	logger.Info("Ledger health sampler scheduled", zap.Duration("interval", LedgerSamplerInterval))
}
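// Typical wiring at process startup (an illustrative sketch; assumes the
// caller already owns a root context, a *gorm.DB, and a *zap.Logger):
//
//	ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt)
//	defer stop()
//	monitoring.ScheduleLedgerHealthSampler(ctx, db, logger)
//
// promauto registers every metric above on the default registry at
// package init, so whatever /metrics handler the app already serves
// picks them up without extra plumbing.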