package services

import (
	"context"
	"fmt"
	"sync"
	"time"

	"github.com/prometheus/client_golang/api"
	v1 "github.com/prometheus/client_golang/api/prometheus/v1"
	"github.com/prometheus/common/model"
	"go.uber.org/zap"
)

// AlertSeverity represents the severity level of an alert
type AlertSeverity string

const (
	SeverityCritical AlertSeverity = "critical"
	SeverityWarning  AlertSeverity = "warning"
	SeverityInfo     AlertSeverity = "info"
)

// AlertStatus represents the status of an alert
type AlertStatus string

const (
	AlertStatusFiring   AlertStatus = "firing"
	AlertStatusResolved AlertStatus = "resolved"
	AlertStatusPending  AlertStatus = "pending"
)

// AlertRule represents a monitoring alert rule
type AlertRule struct {
	Name        string        `json:"name"`
	Query       string        `json:"query"`
	Threshold   float64       `json:"threshold"`
	Severity    AlertSeverity `json:"severity"`
	Duration    time.Duration `json:"duration"` // Duration before alert fires
	Description string        `json:"description"`
	Enabled     bool          `json:"enabled"`
}

// MonitoringAlert represents an active or resolved monitoring alert
type MonitoringAlert struct {
	RuleName   string        `json:"rule_name"`
	Severity   AlertSeverity `json:"severity"`
	Status     AlertStatus   `json:"status"`
	Value      float64       `json:"value"`
	Threshold  float64       `json:"threshold"`
	Message    string        `json:"message"`
	FiredAt    time.Time     `json:"fired_at,omitempty"`
	ResolvedAt time.Time     `json:"resolved_at,omitempty"`
}

// MonitoringAlertNotification represents a notification for a monitoring alert
type MonitoringAlertNotification struct {
	Alert     *MonitoringAlert       `json:"alert"`
	Channels  []string               `json:"channels"` // email, slack, webhook, etc.
	Metadata  map[string]interface{} `json:"metadata,omitempty"`
	Timestamp time.Time              `json:"timestamp"`
}

// MonitoringAlertingService provides monitoring and alerting capabilities
// BE-SVC-014: Implement monitoring and alerting
type MonitoringAlertingService struct {
	prometheusClient v1.API
	rules            []AlertRule
	activeAlerts     map[string]*MonitoringAlert
	mu               sync.RWMutex
	logger           *zap.Logger
	notificationFunc func(*MonitoringAlertNotification) error
}

// MonitoringConfig represents configuration for monitoring service
type MonitoringConfig struct {
	PrometheusURL string
	Logger        *zap.Logger
}

// NewMonitoringAlertingService creates a new monitoring and alerting service
func NewMonitoringAlertingService(config MonitoringConfig) (*MonitoringAlertingService, error) {
	if config.Logger == nil {
		config.Logger = zap.NewNop()
	}

	var promClient v1.API
	if config.PrometheusURL != "" {
		client, err := api.NewClient(api.Config{
			Address: config.PrometheusURL,
		})
		if err != nil {
			return nil, fmt.Errorf("failed to create Prometheus client: %w", err)
		}
		promClient = v1.NewAPI(client)
	}

	return &MonitoringAlertingService{
		prometheusClient: promClient,
		rules:            make([]AlertRule, 0),
		activeAlerts:     make(map[string]*MonitoringAlert),
		logger:           config.Logger,
	}, nil
}

// SetNotificationFunc sets the function to call when an alert fires
func (s *MonitoringAlertingService) SetNotificationFunc(fn func(*MonitoringAlertNotification) error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.notificationFunc = fn
}

// AddAlertRule adds a new alert rule
func (s *MonitoringAlertingService) AddAlertRule(rule AlertRule) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.rules = append(s.rules, rule)
	s.logger.Info("Alert rule added",
		zap.String("name", rule.Name),
		zap.String("severity", string(rule.Severity)),
	)
}

// GetAlertRules returns all alert rules
func (s *MonitoringAlertingService) GetAlertRules() []AlertRule {
	s.mu.RLock()
	defer s.mu.RUnlock()
	rules := make([]AlertRule, len(s.rules))
	copy(rules, s.rules)
	return rules
}
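// exampleMonitoringSetup is a minimal usage sketch, not part of the service API:
// it assumes a Prometheus instance at localhost:9090 and uses a log-only
// notification sink in place of real email/Slack/webhook integrations.
func exampleMonitoringSetup(logger *zap.Logger) (*MonitoringAlertingService, error) {
	svc, err := NewMonitoringAlertingService(MonitoringConfig{
		PrometheusURL: "http://localhost:9090", // illustrative address
		Logger:        logger,
	})
	if err != nil {
		return nil, err
	}

	// Register a notification sink; a real deployment would dispatch to the
	// listed channels instead of only logging.
	svc.SetNotificationFunc(func(n *MonitoringAlertNotification) error {
		logger.Info("alert notification",
			zap.String("rule", n.Alert.RuleName),
			zap.String("severity", string(n.Alert.Severity)),
			zap.Strings("channels", n.Channels),
		)
		return nil
	})

	// Start from the built-in defaults; custom rules can be added the same way.
	for _, rule := range GetDefaultAlertRules() {
		svc.AddAlertRule(rule)
	}
	return svc, nil
}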
// GetActiveAlerts returns all active alerts
func (s *MonitoringAlertingService) GetActiveAlerts() []*MonitoringAlert {
	s.mu.RLock()
	defer s.mu.RUnlock()
	alerts := make([]*MonitoringAlert, 0, len(s.activeAlerts))
	for _, alert := range s.activeAlerts {
		alerts = append(alerts, alert)
	}
	return alerts
}

// CheckAlerts evaluates all alert rules and triggers alerts if thresholds are exceeded
func (s *MonitoringAlertingService) CheckAlerts(ctx context.Context) error {
	if s.prometheusClient == nil {
		return fmt.Errorf("Prometheus client not configured")
	}

	s.mu.RLock()
	rules := make([]AlertRule, len(s.rules))
	copy(rules, s.rules)
	s.mu.RUnlock()

	for _, rule := range rules {
		if !rule.Enabled {
			continue
		}

		value, err := s.evaluateQuery(ctx, rule.Query)
		if err != nil {
			s.logger.Warn("Failed to evaluate alert rule",
				zap.String("rule", rule.Name),
				zap.Error(err),
			)
			continue
		}

		alertKey := rule.Name
		s.mu.Lock()
		existingAlert, exists := s.activeAlerts[alertKey]

		if value >= rule.Threshold {
			// Threshold exceeded
			if !exists || existingAlert.Status == AlertStatusResolved {
				// New alert or previously resolved
				alert := &MonitoringAlert{
					RuleName:  rule.Name,
					Severity:  rule.Severity,
					Status:    AlertStatusFiring,
					Value:     value,
					Threshold: rule.Threshold,
					Message:   fmt.Sprintf("%s: value %.2f exceeds threshold %.2f", rule.Description, value, rule.Threshold),
					FiredAt:   time.Now(),
				}
				s.activeAlerts[alertKey] = alert

				// Send notification
				if s.notificationFunc != nil {
					notification := &MonitoringAlertNotification{
						Alert:     alert,
						Channels:  []string{"email", "slack"},
						Timestamp: time.Now(),
					}
					if err := s.notificationFunc(notification); err != nil {
						s.logger.Error("Failed to send alert notification",
							zap.String("rule", rule.Name),
							zap.Error(err),
						)
					}
				}

				s.logger.Warn("Alert fired",
					zap.String("rule", rule.Name),
					zap.String("severity", string(rule.Severity)),
					zap.Float64("value", value),
					zap.Float64("threshold", rule.Threshold),
				)
			} else if exists && existingAlert.Status == AlertStatusFiring {
				// Alert still firing, update value
				existingAlert.Value = value
			}
		} else {
			// Threshold not exceeded
			if exists && existingAlert.Status == AlertStatusFiring {
				// Alert resolved
				existingAlert.Status = AlertStatusResolved
				existingAlert.ResolvedAt = time.Now()
				existingAlert.Message = fmt.Sprintf("%s: value %.2f is below threshold %.2f", rule.Description, value, rule.Threshold)

				s.logger.Info("Alert resolved",
					zap.String("rule", rule.Name),
					zap.Float64("value", value),
					zap.Float64("threshold", rule.Threshold),
				)
			}
		}
		s.mu.Unlock()
	}

	return nil
}

// evaluateQuery evaluates a Prometheus query and returns the value
func (s *MonitoringAlertingService) evaluateQuery(ctx context.Context, query string) (float64, error) {
	result, warnings, err := s.prometheusClient.Query(ctx, query, time.Now())
	if err != nil {
		return 0, fmt.Errorf("query failed: %w", err)
	}
	if len(warnings) > 0 {
		s.logger.Warn("Prometheus query warnings", zap.Strings("warnings", warnings))
	}

	if result.Type() != model.ValVector {
		return 0, fmt.Errorf("unexpected result type: %v", result.Type())
	}

	vector := result.(model.Vector)
	if len(vector) == 0 {
		return 0, fmt.Errorf("query returned no results")
	}

	// Return the first value
	return float64(vector[0].Value), nil
}

// StartMonitoring starts continuous monitoring with periodic alert checks
func (s *MonitoringAlertingService) StartMonitoring(ctx context.Context, interval time.Duration) error {
	if interval <= 0 {
		interval = 30 * time.Second // Default interval
	}

	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	// Read the rule count under the read lock to avoid racing with AddAlertRule.
	s.mu.RLock()
	rulesCount := len(s.rules)
	s.mu.RUnlock()

	s.logger.Info("Starting monitoring",
		zap.Duration("interval", interval),
		zap.Int("rules_count", rulesCount),
	)

	for {
		select {
		case <-ctx.Done():
			s.logger.Info("Monitoring stopped")
			return ctx.Err()
		case <-ticker.C:
			if err := s.CheckAlerts(ctx); err != nil {
				s.logger.Error("Failed to check alerts", zap.Error(err))
			}
		}
	}
}
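// runMonitoringSketch is an illustrative helper, not part of the service API:
// it shows one way to drive the monitoring loop from a background goroutine
// and stop it by cancelling the returned context. The 15-second interval is an
// arbitrary example value, not a package default.
func runMonitoringSketch(parent context.Context, svc *MonitoringAlertingService, logger *zap.Logger) context.CancelFunc {
	ctx, cancel := context.WithCancel(parent)
	go func() {
		// StartMonitoring blocks until the context is cancelled, so it runs in
		// its own goroutine; cancellation is the expected shutdown path.
		if err := svc.StartMonitoring(ctx, 15*time.Second); err != nil && ctx.Err() == nil {
			logger.Error("monitoring loop exited unexpectedly", zap.Error(err))
		}
	}()
	return cancel
}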
// GetAlertByRuleName returns an alert by rule name
func (s *MonitoringAlertingService) GetAlertByRuleName(ruleName string) (*MonitoringAlert, bool) {
	s.mu.RLock()
	defer s.mu.RUnlock()
	alert, exists := s.activeAlerts[ruleName]
	return alert, exists
}

// ResolveAlert manually resolves an alert
func (s *MonitoringAlertingService) ResolveAlert(ruleName string) error {
	s.mu.Lock()
	defer s.mu.Unlock()

	alert, exists := s.activeAlerts[ruleName]
	if !exists {
		return fmt.Errorf("alert not found: %s", ruleName)
	}

	if alert.Status == AlertStatusResolved {
		return nil // Already resolved
	}

	alert.Status = AlertStatusResolved
	alert.ResolvedAt = time.Now()
	alert.Message = "Manually resolved"

	s.logger.Info("Alert manually resolved",
		zap.String("rule", ruleName),
	)
	return nil
}

// GetDefaultAlertRules returns a set of default alert rules for common metrics.
// Queries return the raw value; the threshold comparison is applied by
// CheckAlerts, so the queries must not embed their own comparison (a query like
// "rate(...) > 0.1" returns an empty vector when the condition is false, which
// would make the rule fail to evaluate and prevent alerts from resolving).
func GetDefaultAlertRules() []AlertRule {
	return []AlertRule{
		{
			Name:        "high_error_rate",
			Query:       "rate(veza_errors_total[5m])",
			Threshold:   0.1,
			Severity:    SeverityCritical,
			Duration:    5 * time.Minute,
			Description: "High error rate detected",
			Enabled:     true,
		},
		{
			Name:        "high_response_time",
			Query:       "histogram_quantile(0.95, rate(veza_http_request_duration_seconds_bucket[5m]))",
			Threshold:   1.0,
			Severity:    SeverityWarning,
			Duration:    5 * time.Minute,
			Description: "High response time detected",
			Enabled:     true,
		},
		{
			Name:        "database_connection_pool_exhausted",
			Query:       "veza_database_connections_active / veza_database_connections_max",
			Threshold:   0.9,
			Severity:    SeverityCritical,
			Duration:    2 * time.Minute,
			Description: "Database connection pool nearly exhausted",
			Enabled:     true,
		},
		{
			Name:        "high_memory_usage",
			Query:       "process_resident_memory_bytes / 1024 / 1024 / 1024",
			Threshold:   2.0,
			Severity:    SeverityWarning,
			Duration:    5 * time.Minute,
			Description: "High memory usage detected",
			Enabled:     true,
		},
	}
}
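// customQueueDepthRule sketches how a caller might define its own rule on top
// of the defaults. The metric name veza_job_queue_depth and the chosen
// threshold are hypothetical examples, not metrics emitted by this package.
func customQueueDepthRule() AlertRule {
	return AlertRule{
		Name:        "job_queue_backlog",
		Query:       "veza_job_queue_depth",
		Threshold:   1000,
		Severity:    SeverityWarning,
		Duration:    10 * time.Minute,
		Description: "Job queue backlog is growing",
		Enabled:     true,
	}
}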