veza/veza-backend-api/internal/monitoring/ledger_metrics.go

feat(metrics): ledger-health gauges + alert rules — v1.0.7 item F

Five Prometheus gauges + reconciler metrics + Grafana dashboard + three
alert rules. Closes axis-1 P1.8 and adds observability for item C's
reconciler (user review: "F should include reconciler_* metrics,
otherwise tag is blind on the worker we just shipped").

Gauges (veza_ledger_*, sampled every 60s):
* orphan_refund_rows — THE canary. Pending refunds with empty
  hyperswitch_refund_id older than 5m = Phase 2 crash in RefundOrder.
  Alert: > 0 for 5m → page.
* stuck_orders_pending — order pending > 30m with non-empty payment_id.
  Alert: > 0 for 10m → page.
* stuck_refunds_pending — refund pending > 30m with hs_id.
* failed_transfers_at_max_retry — permanently_failed rows.
* reversal_pending_transfers — item B rows stuck > 30m.

Reconciler metrics (veza_reconciler_*):
* actions_total{phase} — counter by phase.
* orphan_refunds_total — two-phase-bug canary.
* sweep_duration_seconds — exponential histogram.
* last_run_timestamp — alert: stale > 2h → page (worker dead).

Implementation notes:
* Sampler thresholds hardcoded to match reconciler defaults —
  intentional mismatch allowed (alerts fire while the reconciler is
  already working the issue = correct behavior).
* Query error sets the gauge to -1 (sentinel for "sampler broken").
* marketplace package routes through monitoring recorders so it doesn't
  import prometheus directly.
* Sampler runs regardless of Hyperswitch enablement; gauges default to 0
  when the pipeline is idle.
* Graceful shutdown wired in cmd/api/main.go.

Alert rules in config/alertmanager/ledger.yml with runbook pointers and
detailed descriptions — each alert explains WHAT happened, WHY the
reconciler may not resolve it, and WHERE to look first.

Grafana dashboard config/grafana/dashboards/ledger-health.json:
* Top row — 5 stat panels (orphan first, color-coded red on > 0).
* Middle row — trend timeseries + reconciler action rate by phase.
* Bottom row — sweep duration p50/p95/p99 + seconds-since-last-tick +
  orphan cumulative.

Tests — 6 cases, all green (sqlite :memory:):
* CountsStuckOrdersPending (includes the filter on non-empty payment_id)
* StuckOrdersZeroWhenAllCompleted
* CountsOrphanRefunds (THE canary)
* CountsStuckRefundsWithHsID (gauge-orthogonality check)
* CountsFailedAndReversalPendingTransfers
* ReconcilerRecorders (counter + gauge shape)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 01:40:14 +00:00
package monitoring
import (
	"context"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"go.uber.org/zap"
	"gorm.io/gorm"
)
// Ledger-health metrics (v1.0.7 item F, audit P1.8).
//
// Five gauges expose stuck-state counts so ops dashboards + alert
// rules can spot a money-movement pipeline stall before a customer
// does. Sampled every LedgerSamplerInterval via a 60s ticker that
// runs a cheap indexed SELECT COUNT(*) per gauge. The query cost is
// bounded: each clause filters on `status + created_at`, both
// indexed on the relevant tables.
//
// Paired with reconciler_* counters so the dashboard can tell the
// whole story at a glance: "we have N stuck orders and the
// reconciler has resolved M of them today."
//
// Plus three alert rules in config/alertmanager/ledger.yml:
// * ledger_stuck_orders_pending > 0 for 10m → page
// * ledger_orphan_refund_rows > 0 for 5m → page (bug in two-phase
//   commit between DB and PSP — immediate ops attention)
// * reconciler last_run_timestamp stale for > 2h → page (worker dead)
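//
// Illustrative PromQL for those rules (a sketch only; the authoritative
// definitions live in config/alertmanager/ledger.yml):
//
//	veza_ledger_orphan_refund_rows > 0            (for: 5m)
//	veza_ledger_stuck_orders_pending > 0          (for: 10m)
//	time() - veza_reconciler_last_run_timestamp > 2 * 3600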
var (
// LedgerStuckOrdersPending is the count of orders sitting in
// `pending` past the staleness threshold (30m by default). Should
// be 0 in steady state; non-zero means webhooks from the PSP
// stopped arriving or our endpoint is rejecting them.
LedgerStuckOrdersPending = promauto.NewGauge(prometheus.GaugeOpts{
Name: "veza_ledger_stuck_orders_pending",
Help: "Orders in 'pending' status older than the staleness threshold (30m). Non-zero triggers ops alert.",
})
// LedgerStuckRefundsPending is the count of refunds with a PSP id
// but still in `pending` past the threshold. Symptom: the refund
// was accepted by Hyperswitch but our webhook handler never
// received the terminal event.
LedgerStuckRefundsPending = promauto.NewGauge(prometheus.GaugeOpts{
Name: "veza_ledger_stuck_refunds_pending",
Help: "Refunds with hyperswitch_refund_id set but status still 'pending' older than 30m.",
})
// LedgerFailedTransfersAtMaxRetry is the count of seller_transfers
// that exhausted the retry worker's attempts. Non-zero = ops
// investigation required; the transfer could not complete on the
// Stripe Connect side and needs manual follow-up.
LedgerFailedTransfersAtMaxRetry = promauto.NewGauge(prometheus.GaugeOpts{
Name: "veza_ledger_failed_transfers_at_max_retry",
Help: "seller_transfers with status='permanently_failed' — retry worker gave up.",
})
// LedgerOrphanRefundRows is THE alert gauge. Non-zero means a
// Refund row exists in 'pending' with no hyperswitch_refund_id
// (past 5m) — i.e., Phase 1 ran, Phase 2 crashed. This is a
// two-phase-commit bug. Page on > 0 for 5m.
LedgerOrphanRefundRows = promauto.NewGauge(prometheus.GaugeOpts{
Name: "veza_ledger_orphan_refund_rows",
Help: "Refunds pending with empty hyperswitch_refund_id older than 5m. Non-zero = crash between Phase 1 and Phase 2 of RefundOrder, page ops immediately.",
})
// LedgerReversalPendingTransfers tracks rows waiting for the
// Stripe reversal worker (v1.0.7 item B). Non-zero during a
// reversal is normal and transient; sustained > 0 means the
// reversal worker is stuck or Stripe Connect is down.
LedgerReversalPendingTransfers = promauto.NewGauge(prometheus.GaugeOpts{
Name: "veza_ledger_reversal_pending_transfers",
Help: "seller_transfers in 'reversal_pending' older than 30m — reversal worker is behind.",
})
// --- Reconciler metrics (v1.0.7 item F, covers item C) ---
// ReconcilerActionsTotal counts actions the reconciliation worker
// has taken, labelled by phase. Lets dashboards show "reconciler
// fixed N stuck orders this week" vs "orphan refunds auto-failed
// this week" without re-parsing logs.
ReconcilerActionsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "veza_reconciler_actions_total",
Help: "Reconciler actions taken, by phase (stuck_orders|stuck_refunds|orphan_refunds).",
}, []string{"phase"})
// ReconcilerOrphanRefundsTotal is the load-bearing counter for
// detecting two-phase-commit bugs. Each orphan refund the
// reconciler auto-fails increments this; a sustained non-zero
// rate is the ledger-health equivalent of a fire alarm.
ReconcilerOrphanRefundsTotal = promauto.NewCounter(prometheus.CounterOpts{
Name: "veza_reconciler_orphan_refunds_total",
Help: "Orphan refunds (Phase 2 crash) auto-failed by the reconciler. Non-zero rate = investigate root cause.",
})
// ReconcilerSweepDurationSeconds measures one RunOnce tick so
// slow sweeps show up in alerting.
ReconcilerSweepDurationSeconds = promauto.NewHistogram(prometheus.HistogramOpts{
Name: "veza_reconciler_sweep_duration_seconds",
Help: "Duration of one ReconcileHyperswitchWorker.RunOnce tick in seconds.",
Buckets: prometheus.ExponentialBuckets(0.1, 2, 10), // 10 buckets, 0.1s to ~51s
})
// ReconcilerLastRunTimestamp is set to time.Now().Unix() at the
// end of every RunOnce. Alert rule: `time() - veza_reconciler_last_run_timestamp
// > 2 * RECONCILE_INTERVAL` → worker is dead.
ReconcilerLastRunTimestamp = promauto.NewGauge(prometheus.GaugeOpts{
Name: "veza_reconciler_last_run_timestamp",
Help: "Unix timestamp of the last successful ReconcileHyperswitchWorker tick. Stale = worker dead.",
})
)
// Convenience recorders for the reconciler worker. Called from
// internal/core/marketplace/reconcile_hyperswitch.go so that package
// doesn't import Prometheus directly (keeps marketplace clean of
// observability plumbing).
func RecordReconcilerAction(phase string) {
ReconcilerActionsTotal.WithLabelValues(phase).Inc()
}
func RecordReconcilerOrphanRefund() {
ReconcilerOrphanRefundsTotal.Inc()
}
func RecordReconcilerSweepDuration(d time.Duration) {
ReconcilerSweepDurationSeconds.Observe(d.Seconds())
ReconcilerLastRunTimestamp.Set(float64(time.Now().Unix()))
}
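
// A minimal usage sketch from the reconciler's point of view (call sites are
// illustrative; the real worker lives in
// internal/core/marketplace/reconcile_hyperswitch.go, and stuckOrders is a
// hypothetical loop variable):
//
//	start := time.Now()
//	for range stuckOrders {
//		// ... resolve one stuck order ...
//		monitoring.RecordReconcilerAction("stuck_orders")
//	}
//	monitoring.RecordReconcilerSweepDuration(time.Since(start))
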
// --- Sampler ---
// LedgerSamplerInterval is how often the sampler re-queries the DB.
// 60s is the sweet spot for our volumes — scrape cost negligible, and
// stale-by-up-to-a-minute is fine for the alert-rule windows (10m /
// 5m).
const LedgerSamplerInterval = 60 * time.Second
// Staleness thresholds match the reconciler's defaults. If ops tunes
// the reconciler thresholds via env vars, the sampler still reports
// against these constants — intentional: the two serve different
// audiences (reconciler = auto-recovery, sampler = human visibility).
// A mismatch means alerts fire while the reconciler has already
// started working on the issue, which is the correct behavior.
const (
ledgerStuckOrderAgeThreshold = 30 * time.Minute
ledgerStuckRefundAgeThreshold = 30 * time.Minute
ledgerOrphanRefundAgeThreshold = 5 * time.Minute
ledgerReversalPendingThreshold = 30 * time.Minute
)
// SampleLedgerHealth runs the five count queries and updates the
// gauges. Safe to call concurrently (gauge writes are atomic). Any
// query error sets the corresponding gauge to -1 — a distinctive
// value that dashboards can filter on ("sampler is broken, don't
// trust the number").
//
// Exposed as a function rather than a method so tests can drive it
// directly against a sqlite in-memory DB.
func SampleLedgerHealth(ctx context.Context, db *gorm.DB, logger *zap.Logger) {
now := time.Now()
sample := func(name string, gauge prometheus.Gauge, query string, args ...interface{}) {
var count int64
if err := db.WithContext(ctx).Raw(query, args...).Scan(&count).Error; err != nil {
logger.Error("ledger sampler: query failed",
zap.String("gauge", name),
zap.Error(err))
gauge.Set(-1)
return
}
gauge.Set(float64(count))
}
sample("stuck_orders_pending", LedgerStuckOrdersPending,
`SELECT COUNT(*) FROM orders
WHERE status = 'pending'
AND hyperswitch_payment_id IS NOT NULL AND hyperswitch_payment_id <> ''
AND created_at < ?`,
now.Add(-ledgerStuckOrderAgeThreshold))
sample("stuck_refunds_pending", LedgerStuckRefundsPending,
`SELECT COUNT(*) FROM refunds
WHERE status = 'pending'
AND hyperswitch_refund_id IS NOT NULL AND hyperswitch_refund_id <> ''
AND created_at < ?`,
now.Add(-ledgerStuckRefundAgeThreshold))
sample("failed_transfers_at_max_retry", LedgerFailedTransfersAtMaxRetry,
`SELECT COUNT(*) FROM seller_transfers WHERE status = 'permanently_failed'`)
sample("orphan_refund_rows", LedgerOrphanRefundRows,
`SELECT COUNT(*) FROM refunds
WHERE status = 'pending'
AND (hyperswitch_refund_id IS NULL OR hyperswitch_refund_id = '')
AND created_at < ?`,
now.Add(-ledgerOrphanRefundAgeThreshold))
sample("reversal_pending_transfers", LedgerReversalPendingTransfers,
`SELECT COUNT(*) FROM seller_transfers
WHERE status = 'reversal_pending'
AND updated_at < ?`,
now.Add(-ledgerReversalPendingThreshold))
}
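
// A minimal test sketch for driving the sampler directly (assumes the
// gorm.io/driver/sqlite driver and prometheus/client_golang's testutil;
// seedFixtures is a hypothetical helper that creates the orders/refunds/
// seller_transfers tables and inserts stuck rows):
//
//	db, _ := gorm.Open(sqlite.Open(":memory:"), &gorm.Config{})
//	seedFixtures(t, db)
//	SampleLedgerHealth(context.Background(), db, zap.NewNop())
//	got := testutil.ToFloat64(LedgerOrphanRefundRows)
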
// ScheduleLedgerHealthSampler runs SampleLedgerHealth once at startup
// (so dashboards aren't blank for the first minute) and then every
// LedgerSamplerInterval until ctx is cancelled.
func ScheduleLedgerHealthSampler(ctx context.Context, db *gorm.DB, logger *zap.Logger) {
ticker := time.NewTicker(LedgerSamplerInterval)
go func() {
defer ticker.Stop()
SampleLedgerHealth(ctx, db, logger)
for {
select {
case <-ctx.Done():
logger.Info("Ledger health sampler stopped")
return
case <-ticker.C:
SampleLedgerHealth(ctx, db, logger)
}
}
}()
logger.Info("Ledger health sampler scheduled",
zap.Duration("interval", LedgerSamplerInterval))
}