Real User Monitoring closes the gap between synthetic probes (which
already cover server-side latency) and what users actually see in
their browsers. Slow CDN edges, third-party scripts, mobile-CPU
regressions, and bundle bloat all surface here but stay invisible
to backend-side dashboards.
Frontend (apps/web) :
- web-vitals@^4.2.4 dep
- src/observability/webVitals.ts collects LCP / CLS / INP / FID /
TTFB via the npm web-vitals package and POSTs to the backend
using sendBeacon (with fetch keepalive fallback)
- Pageload-level sampling decision (flip a coin once, contribute
all metrics or none) avoids per-metric histogram bias
- Sample rate via VITE_RUM_SAMPLE_RATE (default 1.0 dev / 0.25 prod)
- main.tsx wires initWebVitals() right after initSentry()
- Route slug derived client-side (strips uuid-ish + numeric ids
to keep cardinality low)
Backend :
- internal/handlers/web_vitals_handler.go : POST
/api/v1/observability/web-vitals — anonymous, IP rate-limited
(reuses FrontendLogRateLimit), validates value ranges, normalizes
route + device labels for cardinality
- internal/monitoring/web_vitals.go : Prometheus histograms with
buckets aligned to Google's good/needs-improvement/poor
thresholds, plus beacons-received / beacons-rejected counters
- Tests : 6 handler tests + 3 helper-function tests + 10 frontend
vitest tests (all pass)
Alerts in alert_rules.yml veza_rum group :
- WebVitalsLCPP75Poor (p75 LCP > 4s on a route+device for 30m)
- WebVitalsCLSP75Poor (p75 CLS > 0.25 for 30m)
- WebVitalsINPP75Poor (p75 INP > 500ms for 30m)
- WebVitalsBeaconsStopped (zero beacons for 30m vs yesterday)
Cardinality discipline : labels are bounded to {route, device}
where route is lowercase alnum/dash/underscore, ≤32 chars, and device
is one of mobile/desktop/tablet/unknown. No per-user labels.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
145 lines
5 KiB
Go
145 lines
5 KiB
Go
package handlers
|
||
|
||
import (
|
||
"net/http"
|
||
"strings"
|
||
|
||
"veza-backend-api/internal/monitoring"
|
||
|
||
"github.com/gin-gonic/gin"
|
||
"go.uber.org/zap"
|
||
)
|
||
|
||
// WebVitalsBeacon is the JSON body accepted by the RUM endpoint. It is
// sent by apps/web/src/observability/webVitals.ts and carries a single
// measurement shaped like the npm `web-vitals` Metric type, extended
// with a `route` slug and a `device` form factor that the frontend
// derives (reportedly from window.matchMedia — frontend code is not
// visible from this file).
//
// Label rationale :
//   - route  : page types have different budgets (a player page loads
//     media + waveform, the homepage is markup only).
//   - device : mobile and desktop budgets differ ; a single aggregated
//     p75 would hide device-specific regressions.
//
// user_id / session_id are deliberately absent : they would explode
// label cardinality. The histograms stay aggregated ; per-user
// forensics belong in logs, not in metric labels.
type WebVitalsBeacon struct {
	// Metric is the vital's name : LCP / CLS / INP / FID / TTFB.
	Metric string `json:"metric"`

	// Value is in milliseconds for time-based metrics and unitless
	// for CLS.
	Value float64 `json:"value"`

	// Route is the route slug — e.g. "home", "player", "search".
	Route string `json:"route"`

	// Device is the form factor : mobile / desktop / tablet.
	Device string `json:"device,omitempty"`

	// Rating is the optional client-side classification ("good" /
	// "needs-improvement" / "poor"). It is never used as a label
	// (the histogram conveys the same information more flexibly) ;
	// it is only written to the log for ops correlation.
	Rating string `json:"rating,omitempty"`
}
|
||
|
||
// WebVitalsHandler returns a handler for POST /api/v1/observability/web-vitals.
|
||
//
|
||
// Anonymous endpoint by design — RUM beacons come from any visiting
|
||
// browser including unauthenticated homepage hits. Rate-limited via
|
||
// the global IP rate limiter (covers the abuse case). Cardinality is
|
||
// bounded by the route truncation logic below.
|
||
//
|
||
// @Summary Submit a Web Vitals RUM beacon
|
||
// @Description Accept a single Core Web Vitals measurement from the browser and feed it into the RUM histograms.
|
||
// @Tags Observability
|
||
// @Accept json
|
||
// @Produce json
|
||
// @Param body body WebVitalsBeacon true "RUM beacon"
|
||
// @Success 204
|
||
// @Failure 400 {object} map[string]interface{}
|
||
// @Router /observability/web-vitals [post]
|
||
func WebVitalsHandler(logger *zap.Logger) gin.HandlerFunc {
|
||
return func(c *gin.Context) {
|
||
var beacon WebVitalsBeacon
|
||
if err := c.ShouldBindJSON(&beacon); err != nil {
|
||
monitoring.WebVitalsRejectedTotal.WithLabelValues("bad_json").Inc()
|
||
c.JSON(http.StatusBadRequest, gin.H{"error": "invalid payload"})
|
||
return
|
||
}
|
||
|
||
metric := strings.ToUpper(strings.TrimSpace(beacon.Metric))
|
||
route := normalizeRouteLabel(beacon.Route)
|
||
device := normalizeDeviceLabel(beacon.Device)
|
||
|
||
// v1.0.10 ops item 9 — value sanity check. CLS is unitless and
|
||
// rarely above 1.0 ; everything else is milliseconds and the
|
||
// browser caps make 60_000 ms a hard upper bound. Reject the
|
||
// outliers : they're either a clock skew bug or an attacker
|
||
// seeding the histogram with junk.
|
||
if !isValidVitalValue(metric, beacon.Value) {
|
||
monitoring.WebVitalsRejectedTotal.WithLabelValues("bad_value").Inc()
|
||
c.JSON(http.StatusBadRequest, gin.H{"error": "value out of range"})
|
||
return
|
||
}
|
||
|
||
if !monitoring.RecordWebVital(metric, route, device, beacon.Value) {
|
||
monitoring.WebVitalsRejectedTotal.WithLabelValues("bad_metric").Inc()
|
||
c.JSON(http.StatusBadRequest, gin.H{"error": "unknown metric"})
|
||
return
|
||
}
|
||
|
||
// Debug-level so the access log isn't drowned ; production
|
||
// volume can hit thousands of beacons per minute.
|
||
logger.Debug("rum beacon",
|
||
zap.String("metric", metric),
|
||
zap.String("route", route),
|
||
zap.String("device", device),
|
||
zap.Float64("value", beacon.Value),
|
||
zap.String("rating", beacon.Rating))
|
||
|
||
c.Status(http.StatusNoContent)
|
||
}
|
||
}
|
||
|
||
// normalizeRouteLabel turns a caller-supplied route slug into a
// bounded label value. The frontend is expected to send a short slug
// already ("player", "home", "search"), but the endpoint is anonymous
// and a hostile caller may submit anything, so the input is :
//
//  1. lowercased and stripped of surrounding whitespace ;
//  2. truncated to its first 32 bytes ;
//  3. filtered down to a-z, 0-9, '-' and '_'.
//
// Truncation happens before filtering, so the result can be shorter
// than 32 characters. An input that is empty, or has nothing left
// after filtering, yields "unknown".
func normalizeRouteLabel(s string) string {
	const (
		fallback = "unknown"
		maxBytes = 32
	)

	slug := strings.TrimSpace(strings.ToLower(s))
	if slug == "" {
		return fallback
	}
	if len(slug) > maxBytes {
		slug = slug[:maxBytes]
	}

	// keepAllowed returns the rune unchanged when it belongs to the
	// allowed set and a negative value otherwise, which strings.Map
	// interprets as "drop this character".
	keepAllowed := func(r rune) rune {
		switch {
		case 'a' <= r && r <= 'z', '0' <= r && r <= '9', r == '-', r == '_':
			return r
		}
		return -1
	}

	if cleaned := strings.Map(keepAllowed, slug); cleaned != "" {
		return cleaned
	}
	return fallback
}
|
||
|
||
// normalizeDeviceLabel maps the client-reported form factor onto the
// closed set mobile / desktop / tablet / unknown, so that arbitrary
// strings such as "Mobile-Safari/iPhone-13" cannot blow up label
// cardinality.
//
// Matching ignores case and surrounding whitespace, and the returned
// value is the canonical form (trimmed and lowercased). Returning the
// canonical form matters : handing back the raw input lowercased but
// untrimmed would let " mobile" and "mobile" become distinct labels.
func normalizeDeviceLabel(s string) string {
	device := strings.ToLower(strings.TrimSpace(s))
	switch device {
	case "mobile", "desktop", "tablet":
		return device
	default:
		return "unknown"
	}
}
|
||
|
||
// isValidVitalValue reports whether v is a plausible reading for the
// given (already upper-cased) metric name, so that obviously bogus
// samples never reach the histograms. Accepted ranges, inclusive :
//
//   - "CLS"          : 0 to 10 (real values are < 1.0 ; 10 is generous)
//   - any other name : 0 to 60_000, i.e. one minute in milliseconds
//
// Negative readings are always refused. The metric name itself is not
// validated here : an unrecognized name is simply held to the
// millisecond range.
func isValidVitalValue(metric string, v float64) bool {
	const (
		maxCLS    = 10     // unitless layout-shift score
		maxTimeMs = 60_000 // one minute, in milliseconds
	)

	upper := float64(maxTimeMs)
	if metric == "CLS" {
		upper = maxCLS
	}
	return v >= 0 && v <= upper
}
|