package monitoring

// Real User Monitoring (RUM) — v1.0.10 ops item 9.
//
// Web Vitals is the standard Google-defined frontend perf signal:
// LCP (Largest Contentful Paint), CLS (Cumulative Layout Shift),
// FID (First Input Delay — being deprecated), INP (Interaction to
// Next Paint — replaces FID), TTFB (Time to First Byte). Each
// metric has a published "good / needs improvement / poor" budget;
// p75 across users is the headline number.
//
// We collect from the browser via the npm `web-vitals` package
// (instrumented in apps/web/src/observability/webVitals.ts) and
// receive POSTs at /api/v1/observability/web-vitals. The handler
// pushes into the histograms below. Synthetic probes already
// cover server-side latency; RUM closes the "user's actual
// browser experience" gap (slow CDN edges, third-party scripts,
// device CPU, mobile networks).
//
// Cardinality discipline: labels are bounded — metric name (the 5
// known metrics plus a single "unknown" bucket for rejected names),
// route (capped via the truncation logic in the handler), device
// (3 values mobile/desktop/tablet). No user_id, no URL query
// string, no full path with IDs. Prometheus tolerates a few
// thousand label combinations; per-user labels would explode it.

import (
	"strings"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

var (
	// WebVitalsLCPSeconds — Largest Contentful Paint, in seconds.
	// Google budget: <2.5s good, 2.5–4s needs improvement, >4s poor.
	// Buckets cover the typical range with extra resolution near the
	// "good" boundary, which is what we tune against.
	WebVitalsLCPSeconds = promauto.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "veza_web_vitals_lcp_seconds",
			Help:    "Largest Contentful Paint reported by the browser, in seconds.",
			Buckets: []float64{0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 4.0, 6.0, 10.0},
		},
		[]string{"route", "device"},
	)

	// WebVitalsCLS — Cumulative Layout Shift, unitless. Google
	// budget: <0.1 good, 0.1–0.25 needs improvement, >0.25 poor.
	// Histogram (not counter) so we can pull p75 over windows.
	WebVitalsCLS = promauto.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "veza_web_vitals_cls",
			Help:    "Cumulative Layout Shift score reported by the browser (unitless).",
			Buckets: []float64{0.05, 0.1, 0.15, 0.25, 0.5, 1.0},
		},
		[]string{"route", "device"},
	)

	// WebVitalsINPSeconds — Interaction to Next Paint, in seconds.
	// Google budget: <0.2s good, 0.2–0.5s needs improvement, >0.5s
	// poor. Replaces FID as the primary responsiveness metric in
	// Core Web Vitals 2024+.
	WebVitalsINPSeconds = promauto.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "veza_web_vitals_inp_seconds",
			Help:    "Interaction to Next Paint reported by the browser, in seconds.",
			Buckets: []float64{0.05, 0.1, 0.2, 0.3, 0.5, 0.75, 1.0, 2.0},
		},
		[]string{"route", "device"},
	)

	// WebVitalsFIDSeconds — First Input Delay (legacy, kept for
	// backwards-compat with old browsers that don't report INP).
	// Buckets share INP's structure for dashboard symmetry.
	WebVitalsFIDSeconds = promauto.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "veza_web_vitals_fid_seconds",
			Help:    "First Input Delay reported by the browser, in seconds (legacy ; INP is preferred).",
			Buckets: []float64{0.05, 0.1, 0.2, 0.3, 0.5, 0.75, 1.0, 2.0},
		},
		[]string{"route", "device"},
	)

	// WebVitalsTTFBSeconds — Time To First Byte, in seconds. Often
	// dominated by network + edge cache miss; useful for tracking
	// CDN-edge degradations (compare with the synthetic probe TTFB).
	WebVitalsTTFBSeconds = promauto.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "veza_web_vitals_ttfb_seconds",
			Help:    "Time to First Byte reported by the browser, in seconds.",
			Buckets: []float64{0.05, 0.1, 0.2, 0.4, 0.8, 1.5, 3.0},
		},
		[]string{"route", "device"},
	)

	// WebVitalsBeaconsTotal counts inbound RUM beacons regardless
	// of metric. Drives the "RUM stopped flowing" alert (no beacons
	// in 30m = frontend instrumentation broken or CDN blocking us).
	// The "metric" label is either a canonical upper-case metric
	// name or "unknown" for names RecordWebVital does not recognise.
	WebVitalsBeaconsTotal = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Name: "veza_web_vitals_beacons_total",
			Help: "Total RUM beacons received broken down by metric name.",
		},
		[]string{"metric"},
	)

	// WebVitalsRejectedTotal counts beacons rejected for invalid
	// payload (bad metric name, out-of-range value, missing route).
	// Bumps on this counter mean a frontend regression or a hostile
	// caller trying to seed the metric.
	WebVitalsRejectedTotal = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Name: "veza_web_vitals_rejected_total",
			Help: "RUM beacons rejected during validation, by reason.",
		},
		[]string{"reason"},
	)
)

// RecordWebVital dispatches a single metric observation into the
// right histogram.
//
// metric is one of "LCP", "CLS", "INP", "FID", "TTFB"; matching is
// case-insensitive (the name is upper-cased before dispatch). route
// and device are used verbatim as label values, so the caller is
// responsible for keeping them bounded. value is the metric value as
// reported by the browser: milliseconds for the time metrics, unitless
// for CLS. Time metrics are converted ms→s so the histograms stay
// consistent (seconds, plus unitless CLS).
//
// Every call increments WebVitalsBeaconsTotal exactly once. Known
// metrics are counted under their canonical upper-case name; anything
// else is counted under the fixed label "unknown" so a caller-supplied
// string can never create a new time series.
//
// Returns false if the metric name is unknown; the handler increments
// WebVitalsRejectedTotal{reason="bad_metric"} on false.
func RecordWebVital(metric, route, device string, value float64) bool {
	// Fixed label for unrecognised names: keeps the beacon total
	// accurate without letting untrusted input pick label values.
	const unknownMetricLabel = "unknown"

	// The browser reports time metrics in milliseconds.
	const msPerSecond = 1000.0

	// Canonicalise so "lcp" and "LCP" hit the same case below and the
	// same counter series.
	name := strings.ToUpper(metric)

	var (
		hist     *prometheus.HistogramVec
		observed float64
	)
	switch name {
	case "LCP":
		hist, observed = WebVitalsLCPSeconds, value/msPerSecond
	case "CLS":
		// CLS is a unitless score; record it as reported.
		hist, observed = WebVitalsCLS, value
	case "INP":
		hist, observed = WebVitalsINPSeconds, value/msPerSecond
	case "FID":
		hist, observed = WebVitalsFIDSeconds, value/msPerSecond
	case "TTFB":
		hist, observed = WebVitalsTTFBSeconds, value/msPerSecond
	default:
		WebVitalsBeaconsTotal.WithLabelValues(unknownMetricLabel).Inc()
		return false
	}

	WebVitalsBeaconsTotal.WithLabelValues(name).Inc()
	hist.WithLabelValues(route, device).Observe(observed)
	return true
}