package monitoring

import (
	"context"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"go.uber.org/zap"
	"gorm.io/gorm"
)
// Ledger-health metrics (v1.0.7 item F, audit P1.8).
//
// Five gauges expose stuck-state counts so ops dashboards + alert
// rules can spot a money-movement pipeline stall before a customer
// does. Sampled every LedgerSamplerInterval via a 60s ticker that
// runs a cheap indexed SELECT COUNT(*) per gauge. The query cost is
// bounded: each clause filters on `status + created_at`, both
// indexed on the relevant tables.
//
// Paired with reconciler_* counters so the dashboard can tell the
// whole story at a glance: "we have N stuck orders and the
// reconciler has resolved M of them today."
//
// Plus two alert rules in config/alertmanager/ledger.yml:
// - ledger_stuck_orders_pending > 0 for 10m → page
// - ledger_orphan_refund_rows > 0 for 5m → page (bug in two-phase
// commit between DB and PSP — immediate ops attention)
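//
// A sketch of what those two rules are assumed to look like in Prometheus
// alerting-rule form (the canonical YAML lives in config/alertmanager/ledger.yml
// and may differ; the severity label here is illustrative):
//
//	- alert: LedgerStuckOrdersPending
//	  expr: veza_ledger_stuck_orders_pending > 0
//	  for: 10m
//	  labels: {severity: page}
//	- alert: LedgerOrphanRefundRows
//	  expr: veza_ledger_orphan_refund_rows > 0
//	  for: 5m
//	  labels: {severity: page}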
var (
// LedgerStuckOrdersPending is the count of orders sitting in
// `pending` past the staleness threshold (30m by default). Should
// be 0 in steady state; non-zero means webhooks from the PSP
// stopped arriving or our endpoint is rejecting them.
LedgerStuckOrdersPending = promauto.NewGauge(prometheus.GaugeOpts{
	Name: "veza_ledger_stuck_orders_pending",
	Help: "Orders in 'pending' status older than the staleness threshold (30m). Non-zero triggers ops alert.",
})
// LedgerStuckRefundsPending is the count of refunds with a PSP id
// but still in `pending` past the threshold. Symptom: the refund
// was accepted by Hyperswitch but our webhook handler never
// received the terminal event.
LedgerStuckRefundsPending = promauto.NewGauge(prometheus.GaugeOpts{
	Name: "veza_ledger_stuck_refunds_pending",
	Help: "Refunds with hyperswitch_refund_id set but status still 'pending' older than 30m.",
})
// LedgerFailedTransfersAtMaxRetry is the count of seller_transfers
// that exhausted the retry worker's attempts. Non-zero = ops
// investigation required; the Stripe Connect side is stuck
// somehow.
LedgerFailedTransfersAtMaxRetry = promauto.NewGauge(prometheus.GaugeOpts{
	Name: "veza_ledger_failed_transfers_at_max_retry",
	Help: "seller_transfers with status='permanently_failed' — retry worker gave up.",
})
// LedgerOrphanRefundRows is THE alert gauge. Non-zero means a
// Refund row exists in 'pending' with no hyperswitch_refund_id
// (past 5m) — i.e., Phase 1 ran, Phase 2 crashed. This is a
// two-phase-commit bug. Page on > 0 for 5m.
LedgerOrphanRefundRows = promauto.NewGauge(prometheus.GaugeOpts{
	Name: "veza_ledger_orphan_refund_rows",
	Help: "Refunds pending with empty hyperswitch_refund_id older than 5m. Non-zero = crash between Phase 1 and Phase 2 of RefundOrder, page ops immediately.",
})
// LedgerReversalPendingTransfers tracks rows waiting for the
// Stripe reversal worker (v1.0.7 item B). Non-zero during a
// reversal is normal and transient; sustained > 0 means the
// reversal worker is stuck or Stripe Connect is down.
LedgerReversalPendingTransfers = promauto.NewGauge(prometheus.GaugeOpts{
	Name: "veza_ledger_reversal_pending_transfers",
	Help: "seller_transfers in 'reversal_pending' older than 30m — reversal worker is behind.",
})
// --- Reconciler metrics (v1.0.7 item F, covers item C) ---
// ReconcilerActionsTotal counts actions the reconciliation worker
// has taken, labelled by phase. Lets dashboards show "reconciler
// fixed N stuck orders this week" vs "orphan refunds auto-failed
// this week" without re-parsing logs.
ReconcilerActionsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
	Name: "veza_reconciler_actions_total",
	Help: "Reconciler actions taken, by phase (stuck_orders|stuck_refunds|orphan_refunds).",
}, []string{"phase"})
// ReconcilerOrphanRefundsTotal is the load-bearing counter for
// detecting two-phase-commit bugs. Each orphan refund the
// reconciler auto-fails increments this; a sustained non-zero
// rate is the ledger-health equivalent of a fire alarm.
ReconcilerOrphanRefundsTotal = promauto.NewCounter(prometheus.CounterOpts{
	Name: "veza_reconciler_orphan_refunds_total",
	Help: "Orphan refunds (Phase 2 crash) auto-failed by the reconciler. Non-zero rate = investigate root cause.",
})
// ReconcilerSweepDurationSeconds measures one RunOnce tick so
// slow sweeps show up in alerting.
ReconcilerSweepDurationSeconds = promauto.NewHistogram(prometheus.HistogramOpts{
	Name:    "veza_reconciler_sweep_duration_seconds",
	Help:    "Duration of one ReconcileHyperswitchWorker.RunOnce tick in seconds.",
	Buckets: prometheus.ExponentialBuckets(0.1, 2, 10), // 0.1s to ~51s
})
// ReconcilerLastRunTimestamp is set to time.Now().Unix() at the
// end of every RunOnce. Alert rule: `now() - timestamp >
// 2 * RECONCILE_INTERVAL` → worker is dead.
ReconcilerLastRunTimestamp = promauto.NewGauge(prometheus.GaugeOpts{
	Name: "veza_reconciler_last_run_timestamp",
	Help: "Unix timestamp of the last successful ReconcileHyperswitchWorker tick. Stale = worker dead.",
})
)
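// Illustrative PromQL sketches for the dashboard/alert queries the comments
// above describe (not the canonical definitions; RECONCILE_INTERVAL below is a
// placeholder for the worker's configured interval, in seconds):
//
//	# "reconciler fixed N stuck orders this week"
//	increase(veza_reconciler_actions_total{phase="stuck_orders"}[7d])
//
//	# reconciler liveness: worker considered dead if the last tick is stale
//	time() - veza_reconciler_last_run_timestamp > 2 * <RECONCILE_INTERVAL>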
// Convenience recorders for the reconciler worker. Called from
// internal/core/marketplace/reconcile_hyperswitch.go so that package
// doesn't import Prometheus directly (keeps marketplace clean of
// observability plumbing).
func RecordReconcilerAction(phase string) {
	ReconcilerActionsTotal.WithLabelValues(phase).Inc()
}

func RecordReconcilerOrphanRefund() {
	ReconcilerOrphanRefundsTotal.Inc()
}

func RecordReconcilerSweepDuration(d time.Duration) {
	ReconcilerSweepDurationSeconds.Observe(d.Seconds())
	ReconcilerLastRunTimestamp.Set(float64(time.Now().Unix()))
}
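// A minimal sketch of how the worker side is assumed to use these helpers;
// the method and helper names below (findOrphanRefunds, autoFail) are
// illustrative, not the real reconcile_hyperswitch.go code:
//
//	func (w *ReconcileHyperswitchWorker) RunOnce(ctx context.Context) error {
//		start := time.Now()
//		defer func() {
//			monitoring.RecordReconcilerSweepDuration(time.Since(start))
//		}()
//
//		for _, r := range w.findOrphanRefunds(ctx) {
//			w.autoFail(ctx, r)
//			monitoring.RecordReconcilerOrphanRefund()
//			monitoring.RecordReconcilerAction("orphan_refunds")
//		}
//		return nil
//	}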
// --- Sampler ---
// LedgerSamplerInterval is how often the sampler re-queries the DB.
// 60s is the sweet spot for our volumes — scrape cost negligible, and
// stale-by-up-to-a-minute is fine for the alert-rule windows (10m /
// 5m).
const LedgerSamplerInterval = 60 * time.Second
// Staleness thresholds match the reconciler's defaults. If ops tunes
// the reconciler thresholds via env vars, the sampler still reports
// against these constants — intentional: the two serve different
// audiences (reconciler = auto-recovery, sampler = human visibility).
// A mismatch means alerts fire while the reconciler has already
// started working on the issue, which is the correct behavior.
const (
ledgerStuckOrderAgeThreshold   = 30 * time.Minute
ledgerStuckRefundAgeThreshold  = 30 * time.Minute
ledgerOrphanRefundAgeThreshold = 5 * time.Minute
ledgerReversalPendingThreshold = 30 * time.Minute
)
// SampleLedgerHealth runs the five count queries and updates the
// gauges. Safe to call concurrently (gauge writes are atomic). Any
// query error sets the corresponding gauge to -1 — a distinctive
// value that dashboards can filter on ("sampler is broken, don't
// trust the number").
//
// Exposed as a function rather than a method so tests can drive it
// directly against a sqlite in-memory DB.
func SampleLedgerHealth(ctx context.Context, db *gorm.DB, logger *zap.Logger) {
	now := time.Now()
	sample := func(name string, gauge prometheus.Gauge, query string, args ...interface{}) {
		var count int64
		if err := db.WithContext(ctx).Raw(query, args...).Scan(&count).Error; err != nil {
			logger.Error("ledger sampler: query failed",
				zap.String("gauge", name),
				zap.Error(err))
			gauge.Set(-1)
			return
		}
		gauge.Set(float64(count))
	}
sample ( "stuck_orders_pending" , LedgerStuckOrdersPending ,
` SELECT COUNT ( * ) FROM orders
WHERE status = ' pending '
AND hyperswitch_payment_id IS NOT NULL AND hyperswitch_payment_id < > ' '
AND created_at < ? ` ,
now . Add ( - ledgerStuckOrderAgeThreshold ) )
sample ( "stuck_refunds_pending" , LedgerStuckRefundsPending ,
` SELECT COUNT ( * ) FROM refunds
WHERE status = ' pending '
AND hyperswitch_refund_id IS NOT NULL AND hyperswitch_refund_id < > ' '
AND created_at < ? ` ,
now . Add ( - ledgerStuckRefundAgeThreshold ) )
sample ( "failed_transfers_at_max_retry" , LedgerFailedTransfersAtMaxRetry ,
` SELECT COUNT(*) FROM seller_transfers WHERE status = 'permanently_failed' ` )
sample ( "orphan_refund_rows" , LedgerOrphanRefundRows ,
` SELECT COUNT ( * ) FROM refunds
WHERE status = ' pending '
AND ( hyperswitch_refund_id IS NULL OR hyperswitch_refund_id = ' ' )
AND created_at < ? ` ,
now . Add ( - ledgerOrphanRefundAgeThreshold ) )
sample ( "reversal_pending_transfers" , LedgerReversalPendingTransfers ,
` SELECT COUNT ( * ) FROM seller_transfers
WHERE status = ' reversal_pending '
AND updated_at < ? ` ,
now . Add ( - ledgerReversalPendingThreshold ) )
}
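// A sketch of the sqlite-backed test the comment above alludes to (assumes
// gorm.io/driver/sqlite and prometheus's testutil package; model migration
// and seeding of a stale 'pending' order are elided):
//
//	db, _ := gorm.Open(sqlite.Open(":memory:"), &gorm.Config{})
//	// ... AutoMigrate orders/refunds/seller_transfers, insert one order
//	//     with status='pending' and created_at older than 30m ...
//	SampleLedgerHealth(context.Background(), db, zap.NewNop())
//	got := testutil.ToFloat64(LedgerStuckOrdersPending) // expect 1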
// ScheduleLedgerHealthSampler runs SampleLedgerHealth once at startup
// (so dashboards aren't blank for the first minute) and then every
// LedgerSamplerInterval until ctx is cancelled.
func ScheduleLedgerHealthSampler(ctx context.Context, db *gorm.DB, logger *zap.Logger) {
	ticker := time.NewTicker(LedgerSamplerInterval)
	go func() {
		defer ticker.Stop()
		SampleLedgerHealth(ctx, db, logger)
		for {
			select {
			case <-ctx.Done():
				logger.Info("Ledger health sampler stopped")
				return
			case <-ticker.C:
				SampleLedgerHealth(ctx, db, logger)
			}
		}
	}()
	logger.Info("Ledger health sampler scheduled",
		zap.Duration("interval", LedgerSamplerInterval))
}