// veza/veza-backend-api/internal/services/monitoring_alerting_service.go

package services

import (
	"context"
	"fmt"
	"sync"
	"time"

	"github.com/prometheus/client_golang/api"
	v1 "github.com/prometheus/client_golang/api/prometheus/v1"
	"github.com/prometheus/common/model"
	"go.uber.org/zap"
)

// AlertSeverity represents the severity level of an alert.
type AlertSeverity string

const (
	SeverityCritical AlertSeverity = "critical"
	SeverityWarning  AlertSeverity = "warning"
	SeverityInfo     AlertSeverity = "info"
)

// AlertStatus represents the status of an alert.
type AlertStatus string

const (
	AlertStatusFiring   AlertStatus = "firing"
	AlertStatusResolved AlertStatus = "resolved"
	AlertStatusPending  AlertStatus = "pending"
)

// AlertRule represents a monitoring alert rule.
type AlertRule struct {
	Name        string        `json:"name"`
	Query       string        `json:"query"`
	Threshold   float64       `json:"threshold"`
	Severity    AlertSeverity `json:"severity"`
	Duration    time.Duration `json:"duration"` // intended hold time before the alert fires (not yet enforced by CheckAlerts)
	Description string        `json:"description"`
	Enabled     bool          `json:"enabled"`
}

// MonitoringAlert represents an active or resolved monitoring alert.
type MonitoringAlert struct {
	RuleName   string        `json:"rule_name"`
	Severity   AlertSeverity `json:"severity"`
	Status     AlertStatus   `json:"status"`
	Value      float64       `json:"value"`
	Threshold  float64       `json:"threshold"`
	Message    string        `json:"message"`
	FiredAt    time.Time     `json:"fired_at,omitempty"`
	ResolvedAt time.Time     `json:"resolved_at,omitempty"`
}

// MonitoringAlertNotification represents a notification for a monitoring alert.
type MonitoringAlertNotification struct {
	Alert     *MonitoringAlert       `json:"alert"`
	Channels  []string               `json:"channels"` // email, slack, webhook, etc.
	Metadata  map[string]interface{} `json:"metadata,omitempty"`
	Timestamp time.Time              `json:"timestamp"`
}

// MonitoringAlertingService provides monitoring and alerting capabilities.
// BE-SVC-014: Implement monitoring and alerting
type MonitoringAlertingService struct {
	prometheusClient v1.API
	rules            []AlertRule
	activeAlerts     map[string]*MonitoringAlert
	mu               sync.RWMutex
	logger           *zap.Logger
	notificationFunc func(*MonitoringAlertNotification) error
}

// MonitoringConfig represents configuration for the monitoring service.
type MonitoringConfig struct {
	PrometheusURL string
	Logger        *zap.Logger
}

// NewMonitoringAlertingService creates a new monitoring and alerting service.
func NewMonitoringAlertingService(config MonitoringConfig) (*MonitoringAlertingService, error) {
	if config.Logger == nil {
		config.Logger = zap.NewNop()
	}

	var promClient v1.API
	if config.PrometheusURL != "" {
		client, err := api.NewClient(api.Config{
			Address: config.PrometheusURL,
		})
		if err != nil {
			return nil, fmt.Errorf("failed to create Prometheus client: %w", err)
		}
		promClient = v1.NewAPI(client)
	}

	return &MonitoringAlertingService{
		prometheusClient: promClient,
		rules:            make([]AlertRule, 0),
		activeAlerts:     make(map[string]*MonitoringAlert),
		logger:           config.Logger,
	}, nil
}

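// Example wiring (a minimal sketch; the Prometheus address is illustrative):
//
//	svc, err := NewMonitoringAlertingService(MonitoringConfig{
//		PrometheusURL: "http://localhost:9090",
//		Logger:        logger,
//	})
//	if err != nil {
//		log.Fatal(err)
//	}
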
// SetNotificationFunc sets the function to call when an alert fires.
func (s *MonitoringAlertingService) SetNotificationFunc(fn func(*MonitoringAlertNotification) error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.notificationFunc = fn
}

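// Example handler (a sketch; printing stands in for a real delivery client):
//
//	svc.SetNotificationFunc(func(n *MonitoringAlertNotification) error {
//		for _, ch := range n.Channels {
//			fmt.Printf("[%s] %s %s: %s\n", ch, n.Alert.Severity, n.Alert.RuleName, n.Alert.Message)
//		}
//		return nil
//	})
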
// AddAlertRule adds a new alert rule.
func (s *MonitoringAlertingService) AddAlertRule(rule AlertRule) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.rules = append(s.rules, rule)
	s.logger.Info("Alert rule added",
		zap.String("name", rule.Name),
		zap.String("severity", string(rule.Severity)),
	)
}

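// Example rule (the veza_http_requests_total metric name is hypothetical):
//
//	svc.AddAlertRule(AlertRule{
//		Name:        "high_request_rate",
//		Query:       "rate(veza_http_requests_total[5m])",
//		Threshold:   100,
//		Severity:    SeverityWarning,
//		Duration:    5 * time.Minute,
//		Description: "Request rate above expected baseline",
//		Enabled:     true,
//	})
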
// GetAlertRules returns all alert rules.
func (s *MonitoringAlertingService) GetAlertRules() []AlertRule {
	s.mu.RLock()
	defer s.mu.RUnlock()
	rules := make([]AlertRule, len(s.rules))
	copy(rules, s.rules)
	return rules
}

// GetActiveAlerts returns a snapshot of all tracked alerts. Copies are
// returned so callers do not race with in-place updates made by CheckAlerts.
func (s *MonitoringAlertingService) GetActiveAlerts() []*MonitoringAlert {
	s.mu.RLock()
	defer s.mu.RUnlock()
	alerts := make([]*MonitoringAlert, 0, len(s.activeAlerts))
	for _, alert := range s.activeAlerts {
		c := *alert
		alerts = append(alerts, &c)
	}
	return alerts
}

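// Inspecting current state (a sketch):
//
//	for _, a := range svc.GetActiveAlerts() {
//		fmt.Printf("%s [%s] value=%.2f threshold=%.2f\n", a.RuleName, a.Status, a.Value, a.Threshold)
//	}
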
// CheckAlerts evaluates all alert rules, fires alerts whose thresholds are
// exceeded, and resolves alerts whose values have dropped back below threshold.
func (s *MonitoringAlertingService) CheckAlerts(ctx context.Context) error {
	if s.prometheusClient == nil {
		return fmt.Errorf("Prometheus client not configured")
	}

	s.mu.RLock()
	rules := make([]AlertRule, len(s.rules))
	copy(rules, s.rules)
	notify := s.notificationFunc
	s.mu.RUnlock()

	for _, rule := range rules {
		if !rule.Enabled {
			continue
		}

		value, err := s.evaluateQuery(ctx, rule.Query)
		if err != nil {
			s.logger.Warn("Failed to evaluate alert rule",
				zap.String("rule", rule.Name),
				zap.Error(err),
			)
			continue
		}

		var notification *MonitoringAlertNotification

		alertKey := rule.Name
		s.mu.Lock()
		existingAlert, exists := s.activeAlerts[alertKey]
		if value >= rule.Threshold {
			// Threshold exceeded
			if !exists || existingAlert.Status == AlertStatusResolved {
				// New alert, or a previously resolved one firing again
				alert := &MonitoringAlert{
					RuleName:  rule.Name,
					Severity:  rule.Severity,
					Status:    AlertStatusFiring,
					Value:     value,
					Threshold: rule.Threshold,
					Message:   fmt.Sprintf("%s: value %.2f exceeds threshold %.2f", rule.Description, value, rule.Threshold),
					FiredAt:   time.Now(),
				}
				s.activeAlerts[alertKey] = alert
				if notify != nil {
					notification = &MonitoringAlertNotification{
						Alert:     alert,
						Channels:  []string{"email", "slack"},
						Timestamp: time.Now(),
					}
				}
				s.logger.Warn("Alert fired",
					zap.String("rule", rule.Name),
					zap.String("severity", string(rule.Severity)),
					zap.Float64("value", value),
					zap.Float64("threshold", rule.Threshold),
				)
			} else if exists && existingAlert.Status == AlertStatusFiring {
				// Alert still firing; record the latest observed value
				existingAlert.Value = value
			}
		} else {
			// Threshold not exceeded
			if exists && existingAlert.Status == AlertStatusFiring {
				// Alert resolved
				existingAlert.Status = AlertStatusResolved
				existingAlert.ResolvedAt = time.Now()
				existingAlert.Message = fmt.Sprintf("%s: value %.2f is below threshold %.2f", rule.Description, value, rule.Threshold)
				s.logger.Info("Alert resolved",
					zap.String("rule", rule.Name),
					zap.Float64("value", value),
					zap.Float64("threshold", rule.Threshold),
				)
			}
		}
		s.mu.Unlock()

		// Send the notification outside the lock so a callback that calls back
		// into this service cannot deadlock.
		if notification != nil {
			if err := notify(notification); err != nil {
				s.logger.Error("Failed to send alert notification",
					zap.String("rule", rule.Name),
					zap.Error(err),
				)
			}
		}
	}
	return nil
}

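// One-off evaluation (a sketch; normally driven by StartMonitoring):
//
//	if err := svc.CheckAlerts(context.Background()); err != nil {
//		log.Printf("alert check failed: %v", err)
//	}
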
// evaluateQuery evaluates an instant Prometheus query and returns the value
// of the first sample in the resulting vector.
func (s *MonitoringAlertingService) evaluateQuery(ctx context.Context, query string) (float64, error) {
	result, warnings, err := s.prometheusClient.Query(ctx, query, time.Now())
	if err != nil {
		return 0, fmt.Errorf("query failed: %w", err)
	}
	if len(warnings) > 0 {
		s.logger.Warn("Prometheus query warnings", zap.Strings("warnings", warnings))
	}

	vector, ok := result.(model.Vector)
	if !ok {
		return 0, fmt.Errorf("unexpected result type: %v", result.Type())
	}
	if len(vector) == 0 {
		return 0, fmt.Errorf("query returned no results")
	}

	// Return the first sample's value
	return float64(vector[0].Value), nil
}

// StartMonitoring starts continuous monitoring with periodic alert checks.
// It blocks until the context is cancelled.
func (s *MonitoringAlertingService) StartMonitoring(ctx context.Context, interval time.Duration) error {
	if interval <= 0 {
		interval = 30 * time.Second // Default interval
	}

	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	// Read the rule count under the lock to avoid a data race with AddAlertRule.
	s.mu.RLock()
	rulesCount := len(s.rules)
	s.mu.RUnlock()

	s.logger.Info("Starting monitoring",
		zap.Duration("interval", interval),
		zap.Int("rules_count", rulesCount),
	)

	for {
		select {
		case <-ctx.Done():
			s.logger.Info("Monitoring stopped")
			return ctx.Err()
		case <-ticker.C:
			if err := s.CheckAlerts(ctx); err != nil {
				s.logger.Error("Failed to check alerts", zap.Error(err))
			}
		}
	}
}

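// Typical usage (a sketch; run in its own goroutine and cancel the context to stop):
//
//	ctx, cancel := context.WithCancel(context.Background())
//	defer cancel()
//	go func() {
//		if err := svc.StartMonitoring(ctx, time.Minute); err != nil && !errors.Is(err, context.Canceled) {
//			log.Printf("monitoring exited: %v", err)
//		}
//	}()
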
// GetAlertByRuleName returns a copy of the alert tracked for the given rule name.
func (s *MonitoringAlertingService) GetAlertByRuleName(ruleName string) (*MonitoringAlert, bool) {
	s.mu.RLock()
	defer s.mu.RUnlock()
	alert, exists := s.activeAlerts[ruleName]
	if !exists {
		return nil, false
	}
	c := *alert // copy so callers don't race with in-place updates
	return &c, true
}

// ResolveAlert manually resolves an alert.
func (s *MonitoringAlertingService) ResolveAlert(ruleName string) error {
	s.mu.Lock()
	defer s.mu.Unlock()

	alert, exists := s.activeAlerts[ruleName]
	if !exists {
		return fmt.Errorf("alert not found: %s", ruleName)
	}
	if alert.Status == AlertStatusResolved {
		return nil // Already resolved
	}

	alert.Status = AlertStatusResolved
	alert.ResolvedAt = time.Now()
	alert.Message = "Manually resolved"
	s.logger.Info("Alert manually resolved",
		zap.String("rule", ruleName),
	)
	return nil
}

// GetDefaultAlertRules returns a set of default alert rules for common metrics.
// Each Query returns a raw value that the service compares against Threshold;
// the comparison must not be baked into the PromQL itself, since a query like
// "rate(...) > 0.1" yields an empty vector when healthy, which evaluateQuery
// reports as an error instead of a resolved alert.
func GetDefaultAlertRules() []AlertRule {
	return []AlertRule{
		{
			Name:        "high_error_rate",
			Query:       "rate(veza_errors_total[5m])",
			Threshold:   0.1,
			Severity:    SeverityCritical,
			Duration:    5 * time.Minute,
			Description: "High error rate detected",
			Enabled:     true,
		},
		{
			Name:        "high_response_time",
			Query:       "histogram_quantile(0.95, rate(veza_http_request_duration_seconds_bucket[5m]))",
			Threshold:   1.0,
			Severity:    SeverityWarning,
			Duration:    5 * time.Minute,
			Description: "High response time detected",
			Enabled:     true,
		},
		{
			Name:        "database_connection_pool_exhausted",
			Query:       "veza_database_connections_active / veza_database_connections_max",
			Threshold:   0.9,
			Severity:    SeverityCritical,
			Duration:    2 * time.Minute,
			Description: "Database connection pool nearly exhausted",
			Enabled:     true,
		},
		{
			Name:        "high_memory_usage",
			Query:       "process_resident_memory_bytes / 1024 / 1024 / 1024",
			Threshold:   2.0,
			Severity:    SeverityWarning,
			Duration:    5 * time.Minute,
			Description: "High memory usage detected",
			Enabled:     true,
		},
	}
}
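
// Wiring the defaults (a sketch):
//
//	for _, rule := range GetDefaultAlertRules() {
//		svc.AddAlertRule(rule)
//	}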