Some checks failed
Veza CI / Rust (Stream Server) (push) Successful in 4m22s
Security Scan / Secret Scanning (gitleaks) (push) Successful in 1m5s
Veza CI / Frontend (Web) (push) Failing after 17m19s
E2E Playwright / e2e (full) (push) Failing after 20m28s
Veza CI / Backend (Go) (push) Successful in 21m31s
Veza CI / Notify on failure (push) Successful in 4s
Three pre-existing infra issues surfaced by the Day 1→Day 3 push wave.
Each is independent — bundled here because the goal is "ci.yml + e2e.yml
green" before the v1.0.9 tag, and they're all small.
(1) gofmt — ci.yml golangci-lint v2 step
Five files were unformatted on main. Pre-existing (untouched by my
Item G work, but the formatter caught them now):
- internal/api/router.go
- internal/core/marketplace/reconcile_hyperswitch_test.go
- internal/models/user.go
- internal/monitoring/ledger_metrics.go
- internal/monitoring/ledger_metrics_test.go
Pure whitespace via `gofmt -w` — no behavior change.
(2) e2e silent-fail — playwright webServer port collision
The e2e workflow pre-starts the backend in step 9 ("Build + start
backend API") so it can fail-fast on a non-ok health check. But
playwright.config.ts had `reuseExistingServer: !process.env.CI` on
the backend webServer entry — meaning in CI Playwright tried to
spawn a SECOND backend on port 18080. The spawn collided with
EADDRINUSE and Playwright silently exited before printing any test
output. The artifact upload then warned "No files were found"
because tests/e2e/playwright-report/ never got written, and the job
ended in `Failure` for an unrelated reason (the artifact upload
step's GHESNotSupportedError).
Fix: backend `reuseExistingServer: true` always — workflow + dev
both pre-start backend on 18080. Vite stays `!CI` because the
workflow doesn't pre-start it. Comment in playwright.config.ts
documents the symptom so the next person debugging gets the
pointer immediately.
(3) orders.hyperswitch_payment_id missing in fresh DBs — migration 080
skip-branch + 099 ordering drift
Migration 080 (`add_payment_fields`) wraps its ALTERs in
"skip if orders doesn't exist". At authoring time orders existed
earlier in the migration sequence; that ordering has since shifted
(orders is now created at 099_z_create_orders.sql, AFTER 080).
Result: in any freshly-migrated DB (CI, fresh dev, future restore
drills) migration 080 takes the skip branch and the columns are
never added — even though the Order model and the marketplace code
rely on them.
Symptom: every CI run logs
pq: column "hyperswitch_payment_id" does not exist
from the periodic ledger_metrics worker. Order checkout would also
fail to persist payment_id at write time, breaking reconciliation.
Fix: append-only migration 987 with idempotent
`ADD COLUMN IF NOT EXISTS` + a partial index on the reconciliation
hot path. Production envs that did pick up 080 in the original
order are no-ops; fresh envs converge to the same end state.
Rollback in migrations/rollback/.
Verified locally:
$ cd veza-backend-api && go build ./... && VEZA_SKIP_INTEGRATION=1 \
go test -short -count=1 ./internal/...
(all green)
SKIP_TESTS=1: backend-only Go + Playwright config + SQL. Frontend
unit tests irrelevant to this commit.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
222 lines
7.7 KiB
Go
222 lines
7.7 KiB
Go
package monitoring
|
|
|
|
import (
|
|
"context"
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/google/uuid"
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
dto "github.com/prometheus/client_model/go"
|
|
"github.com/stretchr/testify/assert"
|
|
"github.com/stretchr/testify/require"
|
|
"go.uber.org/zap"
|
|
"gorm.io/driver/sqlite"
|
|
"gorm.io/gorm"
|
|
)
|
|
|
|
// --- test DB helpers --------------------------------------------------------

// minimal table schemas: we don't import marketplace here to avoid
// circular deps (monitoring is a leaf); we create just what the
// sampler queries.

// testOrder is the fixture mirror of the orders columns the sampler reads.
// Field names are load-bearing: gorm derives column names from them, so they
// must match the production schema (status, hyperswitch_payment_id, created_at).
type testOrder struct {
	ID                   uuid.UUID `gorm:"type:uuid;primaryKey"`
	Status               string
	HyperswitchPaymentID string
	CreatedAt            time.Time
}

// TableName maps the fixture onto the real "orders" table so the
// sampler's queries against that table hit these rows.
func (testOrder) TableName() string { return "orders" }
|
|
|
|
// testRefund is the fixture mirror of the refunds columns the sampler reads.
// Field names are load-bearing: gorm derives column names from them
// (status, hyperswitch_refund_id, created_at).
type testRefund struct {
	ID                  uuid.UUID `gorm:"type:uuid;primaryKey"`
	Status              string
	HyperswitchRefundID string
	CreatedAt           time.Time
}

// TableName maps the fixture onto the real "refunds" table.
func (testRefund) TableName() string { return "refunds" }
|
|
|
|
// testSellerTransfer is the fixture mirror of the seller_transfers columns
// the sampler reads (status, updated_at). Field names are load-bearing for
// gorm column-name derivation.
type testSellerTransfer struct {
	ID        uuid.UUID `gorm:"type:uuid;primaryKey"`
	Status    string
	UpdatedAt time.Time
}

// TableName maps the fixture onto the real "seller_transfers" table.
func (testSellerTransfer) TableName() string { return "seller_transfers" }
|
|
|
|
func setupSamplerTestDB(t *testing.T) *gorm.DB {
|
|
t.Helper()
|
|
db, err := gorm.Open(sqlite.Open(":memory:"), &gorm.Config{})
|
|
require.NoError(t, err)
|
|
require.NoError(t, db.AutoMigrate(&testOrder{}, &testRefund{}, &testSellerTransfer{}))
|
|
return db
|
|
}
|
|
|
|
// gaugeValue reads the raw value of a Prometheus gauge for assertion.
|
|
func gaugeValue(t *testing.T, g prometheus.Gauge) float64 {
|
|
t.Helper()
|
|
m := &dto.Metric{}
|
|
require.NoError(t, g.Write(m))
|
|
return m.GetGauge().GetValue()
|
|
}
|
|
|
|
// --- stuck_orders_pending gauge ---------------------------------------------
|
|
|
|
func TestSampler_CountsStuckOrdersPending(t *testing.T) {
|
|
db := setupSamplerTestDB(t)
|
|
now := time.Now()
|
|
|
|
// 2 stuck (over 30m old) + 1 recent (under 30m) + 1 stuck but no payment_id.
|
|
require.NoError(t, db.Create(&testOrder{
|
|
ID: uuid.New(), Status: "pending", HyperswitchPaymentID: "pay_1",
|
|
CreatedAt: now.Add(-1 * time.Hour),
|
|
}).Error)
|
|
require.NoError(t, db.Create(&testOrder{
|
|
ID: uuid.New(), Status: "pending", HyperswitchPaymentID: "pay_2",
|
|
CreatedAt: now.Add(-45 * time.Minute),
|
|
}).Error)
|
|
require.NoError(t, db.Create(&testOrder{
|
|
ID: uuid.New(), Status: "pending", HyperswitchPaymentID: "pay_recent",
|
|
CreatedAt: now.Add(-5 * time.Minute),
|
|
}).Error)
|
|
// Stuck BUT no payment_id → excluded (pre-PSP, not the gauge's concern).
|
|
require.NoError(t, db.Create(&testOrder{
|
|
ID: uuid.New(), Status: "pending", HyperswitchPaymentID: "",
|
|
CreatedAt: now.Add(-1 * time.Hour),
|
|
}).Error)
|
|
|
|
SampleLedgerHealth(context.Background(), db, zap.NewNop())
|
|
|
|
assert.Equal(t, 2.0, gaugeValue(t, LedgerStuckOrdersPending),
|
|
"gauge must count only rows pending > 30m with non-empty payment_id")
|
|
}
|
|
|
|
func TestSampler_StuckOrdersZeroWhenAllCompleted(t *testing.T) {
|
|
db := setupSamplerTestDB(t)
|
|
require.NoError(t, db.Create(&testOrder{
|
|
ID: uuid.New(), Status: "completed", HyperswitchPaymentID: "pay_done",
|
|
CreatedAt: time.Now().Add(-1 * time.Hour),
|
|
}).Error)
|
|
|
|
SampleLedgerHealth(context.Background(), db, zap.NewNop())
|
|
assert.Equal(t, 0.0, gaugeValue(t, LedgerStuckOrdersPending))
|
|
}
|
|
|
|
// --- orphan_refund_rows gauge (THE alert gauge) -----------------------------
|
|
|
|
func TestSampler_CountsOrphanRefunds(t *testing.T) {
|
|
db := setupSamplerTestDB(t)
|
|
now := time.Now()
|
|
|
|
// Three orphans (empty hs_id, >5m old) + one recent orphan (should not count)
|
|
// + one with an hs_id (should not count).
|
|
for i := 0; i < 3; i++ {
|
|
require.NoError(t, db.Create(&testRefund{
|
|
ID: uuid.New(), Status: "pending", HyperswitchRefundID: "",
|
|
CreatedAt: now.Add(-10 * time.Minute),
|
|
}).Error)
|
|
}
|
|
require.NoError(t, db.Create(&testRefund{
|
|
ID: uuid.New(), Status: "pending", HyperswitchRefundID: "",
|
|
CreatedAt: now.Add(-2 * time.Minute), // too recent
|
|
}).Error)
|
|
require.NoError(t, db.Create(&testRefund{
|
|
ID: uuid.New(), Status: "pending", HyperswitchRefundID: "ref_ok",
|
|
CreatedAt: now.Add(-10 * time.Minute), // has hs_id → not orphan
|
|
}).Error)
|
|
|
|
SampleLedgerHealth(context.Background(), db, zap.NewNop())
|
|
assert.Equal(t, 3.0, gaugeValue(t, LedgerOrphanRefundRows),
|
|
"orphan gauge counts pending refunds with empty hs_id older than 5m — the two-phase-commit-bug canary")
|
|
}
|
|
|
|
// --- stuck_refunds_pending gauge --------------------------------------------
|
|
|
|
func TestSampler_CountsStuckRefundsWithHsID(t *testing.T) {
|
|
db := setupSamplerTestDB(t)
|
|
now := time.Now()
|
|
|
|
require.NoError(t, db.Create(&testRefund{
|
|
ID: uuid.New(), Status: "pending", HyperswitchRefundID: "ref_stuck",
|
|
CreatedAt: now.Add(-45 * time.Minute),
|
|
}).Error)
|
|
// Orphan, counted by a different gauge, not this one:
|
|
require.NoError(t, db.Create(&testRefund{
|
|
ID: uuid.New(), Status: "pending", HyperswitchRefundID: "",
|
|
CreatedAt: now.Add(-45 * time.Minute),
|
|
}).Error)
|
|
|
|
SampleLedgerHealth(context.Background(), db, zap.NewNop())
|
|
assert.Equal(t, 1.0, gaugeValue(t, LedgerStuckRefundsPending))
|
|
// And orphan gauge catches the other one.
|
|
assert.Equal(t, 1.0, gaugeValue(t, LedgerOrphanRefundRows))
|
|
}
|
|
|
|
// --- permanently_failed transfers + reversal_pending ------------------------
|
|
|
|
func TestSampler_CountsFailedAndReversalPendingTransfers(t *testing.T) {
|
|
db := setupSamplerTestDB(t)
|
|
|
|
// 2 permanently_failed, 1 completed (ignored by failed gauge),
|
|
// 1 reversal_pending updated 45m ago, 1 reversal_pending recent (ignored).
|
|
require.NoError(t, db.Create(&testSellerTransfer{
|
|
ID: uuid.New(), Status: "permanently_failed", UpdatedAt: time.Now(),
|
|
}).Error)
|
|
require.NoError(t, db.Create(&testSellerTransfer{
|
|
ID: uuid.New(), Status: "permanently_failed", UpdatedAt: time.Now().Add(-1 * time.Hour),
|
|
}).Error)
|
|
require.NoError(t, db.Create(&testSellerTransfer{
|
|
ID: uuid.New(), Status: "completed", UpdatedAt: time.Now(),
|
|
}).Error)
|
|
require.NoError(t, db.Create(&testSellerTransfer{
|
|
ID: uuid.New(), Status: "reversal_pending", UpdatedAt: time.Now().Add(-45 * time.Minute),
|
|
}).Error)
|
|
require.NoError(t, db.Create(&testSellerTransfer{
|
|
ID: uuid.New(), Status: "reversal_pending", UpdatedAt: time.Now().Add(-5 * time.Minute),
|
|
}).Error)
|
|
|
|
SampleLedgerHealth(context.Background(), db, zap.NewNop())
|
|
assert.Equal(t, 2.0, gaugeValue(t, LedgerFailedTransfersAtMaxRetry))
|
|
assert.Equal(t, 1.0, gaugeValue(t, LedgerReversalPendingTransfers),
|
|
"reversal_pending counts only rows that have been stuck past the threshold")
|
|
}
|
|
|
|
// --- reconciler counters are writable + readable ----------------------------
|
|
|
|
func TestReconcilerRecorders(t *testing.T) {
|
|
// Capture starting values so parallel test runs don't assume a
|
|
// clean slate (Prometheus registries are global).
|
|
actionsStart := func() float64 {
|
|
m := &dto.Metric{}
|
|
require.NoError(t, ReconcilerActionsTotal.WithLabelValues("stuck_orders").Write(m))
|
|
return m.GetCounter().GetValue()
|
|
}()
|
|
orphanStart := func() float64 {
|
|
m := &dto.Metric{}
|
|
require.NoError(t, ReconcilerOrphanRefundsTotal.Write(m))
|
|
return m.GetCounter().GetValue()
|
|
}()
|
|
|
|
RecordReconcilerAction("stuck_orders")
|
|
RecordReconcilerAction("stuck_orders")
|
|
RecordReconcilerOrphanRefund()
|
|
RecordReconcilerSweepDuration(500 * time.Millisecond)
|
|
|
|
actionsAfter := func() float64 {
|
|
m := &dto.Metric{}
|
|
require.NoError(t, ReconcilerActionsTotal.WithLabelValues("stuck_orders").Write(m))
|
|
return m.GetCounter().GetValue()
|
|
}()
|
|
orphanAfter := func() float64 {
|
|
m := &dto.Metric{}
|
|
require.NoError(t, ReconcilerOrphanRefundsTotal.Write(m))
|
|
return m.GetCounter().GetValue()
|
|
}()
|
|
lastRun := gaugeValue(t, ReconcilerLastRunTimestamp)
|
|
|
|
assert.Equal(t, 2.0, actionsAfter-actionsStart)
|
|
assert.Equal(t, 1.0, orphanAfter-orphanStart)
|
|
assert.Greater(t, lastRun, float64(0), "RecordReconcilerSweepDuration must stamp last-run timestamp")
|
|
}
|