package monitoring

import (
	"context"
	"testing"
	"time"

	"github.com/google/uuid"
	"github.com/prometheus/client_golang/prometheus"
	dto "github.com/prometheus/client_model/go"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"go.uber.org/zap"
	"gorm.io/driver/sqlite"
	"gorm.io/gorm"
)

// --- test DB helpers --------------------------------------------------------

// Minimal table schemas: we don't import marketplace here to avoid
// circular deps (monitoring is a leaf); we create just what the
// sampler queries.

// testOrder is the minimal row shape backing the "orders" table in these tests.
type testOrder struct {
	ID                   uuid.UUID `gorm:"type:uuid;primaryKey"`
	Status               string
	HyperswitchPaymentID string
	CreatedAt            time.Time
}

// TableName maps testOrder onto the "orders" table.
func (testOrder) TableName() string { return "orders" }

// testRefund is the minimal row shape backing the "refunds" table in these tests.
type testRefund struct {
	ID                  uuid.UUID `gorm:"type:uuid;primaryKey"`
	Status              string
	HyperswitchRefundID string
	CreatedAt           time.Time
}

// TableName maps testRefund onto the "refunds" table.
func (testRefund) TableName() string { return "refunds" }

// testSellerTransfer is the minimal row shape backing the "seller_transfers"
// table in these tests.
type testSellerTransfer struct {
	ID        uuid.UUID `gorm:"type:uuid;primaryKey"`
	Status    string
	UpdatedAt time.Time
}

// TableName maps testSellerTransfer onto the "seller_transfers" table.
func (testSellerTransfer) TableName() string { return "seller_transfers" }

// setupSamplerTestDB opens a fresh in-memory SQLite database, migrates the
// three test tables and returns the gorm handle. The underlying connection
// pool is closed automatically when the test finishes.
func setupSamplerTestDB(t *testing.T) *gorm.DB {
	t.Helper()

	db, err := gorm.Open(sqlite.Open(":memory:"), &gorm.Config{})
	require.NoError(t, err)

	sqlDB, err := db.DB()
	require.NoError(t, err)
	// With a plain ":memory:" DSN every pooled connection gets its own empty
	// database. Pin the pool to a single connection so the migration, the
	// inserts and the sampler's queries are guaranteed to see the same data.
	sqlDB.SetMaxOpenConns(1)
	t.Cleanup(func() {
		assert.NoError(t, sqlDB.Close())
	})

	require.NoError(t, db.AutoMigrate(&testOrder{}, &testRefund{}, &testSellerTransfer{}))
	return db
}

// gaugeValue reads the raw value of a Prometheus gauge for assertion.
func gaugeValue(t *testing.T, g prometheus.Gauge) float64 {
	t.Helper()

	m := &dto.Metric{}
	require.NoError(t, g.Write(m))
	return m.GetGauge().GetValue()
}

// --- stuck_orders_pending gauge ---------------------------------------------

// TestSampler_CountsStuckOrdersPending checks that the stuck-orders gauge only
// counts pending orders older than 30m that carry a non-empty payment ID.
func TestSampler_CountsStuckOrdersPending(t *testing.T) {
	db := setupSamplerTestDB(t)
	now := time.Now()

	// 2 stuck (over 30m old) + 1 recent (under 30m) + 1 stuck but no payment_id.
	require.NoError(t, db.Create(&testOrder{
		ID:                   uuid.New(),
		Status:               "pending",
		HyperswitchPaymentID: "pay_1",
		CreatedAt:            now.Add(-1 * time.Hour),
	}).Error)
	require.NoError(t, db.Create(&testOrder{
		ID:                   uuid.New(),
		Status:               "pending",
		HyperswitchPaymentID: "pay_2",
		CreatedAt:            now.Add(-45 * time.Minute),
	}).Error)
	require.NoError(t, db.Create(&testOrder{
		ID:                   uuid.New(),
		Status:               "pending",
		HyperswitchPaymentID: "pay_recent",
		CreatedAt:            now.Add(-5 * time.Minute),
	}).Error)
	// Stuck BUT no payment_id → excluded (pre-PSP, not the gauge's concern).
	require.NoError(t, db.Create(&testOrder{
		ID:                   uuid.New(),
		Status:               "pending",
		HyperswitchPaymentID: "",
		CreatedAt:            now.Add(-1 * time.Hour),
	}).Error)

	SampleLedgerHealth(context.Background(), db, zap.NewNop())

	assert.Equal(t, 2.0, gaugeValue(t, LedgerStuckOrdersPending),
		"gauge must count only rows pending > 30m with non-empty payment_id")
}

// TestSampler_StuckOrdersZeroWhenAllCompleted checks that an old but completed
// order does not contribute to the stuck-orders gauge.
func TestSampler_StuckOrdersZeroWhenAllCompleted(t *testing.T) {
	db := setupSamplerTestDB(t)

	require.NoError(t, db.Create(&testOrder{
		ID:                   uuid.New(),
		Status:               "completed",
		HyperswitchPaymentID: "pay_done",
		CreatedAt:            time.Now().Add(-1 * time.Hour),
	}).Error)

	SampleLedgerHealth(context.Background(), db, zap.NewNop())

	assert.Equal(t, 0.0, gaugeValue(t, LedgerStuckOrdersPending))
}

// --- orphan_refund_rows gauge (THE alert gauge) -----------------------------

// TestSampler_CountsOrphanRefunds checks that the orphan gauge counts pending
// refunds with an empty Hyperswitch refund ID that are older than 5m.
func TestSampler_CountsOrphanRefunds(t *testing.T) {
	db := setupSamplerTestDB(t)
	now := time.Now()

	// Three orphans (empty hs_id, >5m old) + one recent orphan (should not count)
	// + one with an hs_id (should not count).
	for i := 0; i < 3; i++ {
		require.NoError(t, db.Create(&testRefund{
			ID:                  uuid.New(),
			Status:              "pending",
			HyperswitchRefundID: "",
			CreatedAt:           now.Add(-10 * time.Minute),
		}).Error)
	}
	require.NoError(t, db.Create(&testRefund{
		ID:                  uuid.New(),
		Status:              "pending",
		HyperswitchRefundID: "",
		CreatedAt:           now.Add(-2 * time.Minute), // too recent
	}).Error)
	require.NoError(t, db.Create(&testRefund{
		ID:                  uuid.New(),
		Status:              "pending",
		HyperswitchRefundID: "ref_ok",
		CreatedAt:           now.Add(-10 * time.Minute), // has hs_id → not orphan
	}).Error)

	SampleLedgerHealth(context.Background(), db, zap.NewNop())

	assert.Equal(t, 3.0, gaugeValue(t, LedgerOrphanRefundRows),
		"orphan gauge counts pending refunds with empty hs_id older than 5m — the two-phase-commit-bug canary")
}

// --- stuck_refunds_pending gauge --------------------------------------------

// TestSampler_CountsStuckRefundsWithHsID checks that a long-pending refund with
// a Hyperswitch refund ID lands in the stuck-refunds gauge, while an equally
// old refund without one is counted by the orphan gauge instead.
func TestSampler_CountsStuckRefundsWithHsID(t *testing.T) {
	db := setupSamplerTestDB(t)
	now := time.Now()

	require.NoError(t, db.Create(&testRefund{
		ID:                  uuid.New(),
		Status:              "pending",
		HyperswitchRefundID: "ref_stuck",
		CreatedAt:           now.Add(-45 * time.Minute),
	}).Error)
	// Orphan, counted by a different gauge, not this one:
	require.NoError(t, db.Create(&testRefund{
		ID:                  uuid.New(),
		Status:              "pending",
		HyperswitchRefundID: "",
		CreatedAt:           now.Add(-45 * time.Minute),
	}).Error)

	SampleLedgerHealth(context.Background(), db, zap.NewNop())

	assert.Equal(t, 1.0, gaugeValue(t, LedgerStuckRefundsPending))
	// And orphan gauge catches the other one.
	assert.Equal(t, 1.0, gaugeValue(t, LedgerOrphanRefundRows))
}

// --- permanently_failed transfers + reversal_pending ------------------------

// TestSampler_CountsFailedAndReversalPendingTransfers checks that the failed
// gauge counts every permanently_failed transfer and that the reversal gauge
// only counts reversal_pending rows that have been stuck past the threshold.
func TestSampler_CountsFailedAndReversalPendingTransfers(t *testing.T) {
	db := setupSamplerTestDB(t)
	// Single time anchor, matching the other sampler tests.
	now := time.Now()

	// 2 permanently_failed, 1 completed (ignored by failed gauge),
	// 1 reversal_pending updated 45m ago, 1 reversal_pending recent (ignored).
	require.NoError(t, db.Create(&testSellerTransfer{
		ID:        uuid.New(),
		Status:    "permanently_failed",
		UpdatedAt: now,
	}).Error)
	require.NoError(t, db.Create(&testSellerTransfer{
		ID:        uuid.New(),
		Status:    "permanently_failed",
		UpdatedAt: now.Add(-1 * time.Hour),
	}).Error)
	require.NoError(t, db.Create(&testSellerTransfer{
		ID:        uuid.New(),
		Status:    "completed",
		UpdatedAt: now,
	}).Error)
	require.NoError(t, db.Create(&testSellerTransfer{
		ID:        uuid.New(),
		Status:    "reversal_pending",
		UpdatedAt: now.Add(-45 * time.Minute),
	}).Error)
	require.NoError(t, db.Create(&testSellerTransfer{
		ID:        uuid.New(),
		Status:    "reversal_pending",
		UpdatedAt: now.Add(-5 * time.Minute),
	}).Error)

	SampleLedgerHealth(context.Background(), db, zap.NewNop())

	assert.Equal(t, 2.0, gaugeValue(t, LedgerFailedTransfersAtMaxRetry))
	assert.Equal(t, 1.0, gaugeValue(t, LedgerReversalPendingTransfers),
		"reversal_pending counts only rows that have been stuck past the threshold")
}

// --- reconciler counters are writable + readable ----------------------------

// TestReconcilerRecorders checks that the reconciler recorder helpers increment
// their counters and that recording a sweep duration stamps the last-run gauge.
func TestReconcilerRecorders(t *testing.T) {
	// counterValue reads the current value of a counter metric. It is kept
	// local to this test so it cannot clash with other package-level helpers.
	counterValue := func(c interface{ Write(*dto.Metric) error }) float64 {
		t.Helper()
		m := &dto.Metric{}
		require.NoError(t, c.Write(m))
		return m.GetCounter().GetValue()
	}

	// Capture starting values so parallel test runs don't assume a
	// clean slate (Prometheus registries are global).
	actionsStart := counterValue(ReconcilerActionsTotal.WithLabelValues("stuck_orders"))
	orphanStart := counterValue(ReconcilerOrphanRefundsTotal)

	RecordReconcilerAction("stuck_orders")
	RecordReconcilerAction("stuck_orders")
	RecordReconcilerOrphanRefund()
	RecordReconcilerSweepDuration(500 * time.Millisecond)

	actionsAfter := counterValue(ReconcilerActionsTotal.WithLabelValues("stuck_orders"))
	orphanAfter := counterValue(ReconcilerOrphanRefundsTotal)
	lastRun := gaugeValue(t, ReconcilerLastRunTimestamp)

	assert.Equal(t, 2.0, actionsAfter-actionsStart)
	assert.Equal(t, 1.0, orphanAfter-orphanStart)
	assert.Greater(t, lastRun, float64(0),
		"RecordReconcilerSweepDuration must stamp last-run timestamp")
}