// veza/veza-backend-api/internal/services/monitoring_alerting_service.go

package services

import (
	"context"
	"fmt"
	"sync"
	"time"

	"github.com/prometheus/client_golang/api"
	v1 "github.com/prometheus/client_golang/api/prometheus/v1"
	"github.com/prometheus/common/model"
	"go.uber.org/zap"
)

// AlertSeverity represents the severity level of an alert.
type AlertSeverity string

const (
	SeverityCritical AlertSeverity = "critical"
	SeverityWarning  AlertSeverity = "warning"
	SeverityInfo     AlertSeverity = "info"
)

// AlertStatus represents the status of an alert.
type AlertStatus string

const (
	AlertStatusFiring   AlertStatus = "firing"
	AlertStatusResolved AlertStatus = "resolved"
	AlertStatusPending  AlertStatus = "pending"
)

// AlertRule represents a monitoring alert rule.
type AlertRule struct {
	Name        string        `json:"name"`
	Query       string        `json:"query"`
	Threshold   float64       `json:"threshold"`
	Severity    AlertSeverity `json:"severity"`
	Duration    time.Duration `json:"duration"` // intended hold time before the alert fires (not yet enforced by CheckAlerts)
	Description string        `json:"description"`
	Enabled     bool          `json:"enabled"`
}

// MonitoringAlert represents an active or resolved monitoring alert.
type MonitoringAlert struct {
	RuleName   string        `json:"rule_name"`
	Severity   AlertSeverity `json:"severity"`
	Status     AlertStatus   `json:"status"`
	Value      float64       `json:"value"`
	Threshold  float64       `json:"threshold"`
	Message    string        `json:"message"`
	FiredAt    time.Time     `json:"fired_at,omitempty"`
	ResolvedAt time.Time     `json:"resolved_at,omitempty"`
}

// MonitoringAlertNotification represents a notification for a monitoring alert.
type MonitoringAlertNotification struct {
	Alert     *MonitoringAlert       `json:"alert"`
	Channels  []string               `json:"channels"` // email, slack, webhook, etc.
	Metadata  map[string]interface{} `json:"metadata,omitempty"`
	Timestamp time.Time              `json:"timestamp"`
}

// MonitoringAlertingService provides monitoring and alerting capabilities.
// BE-SVC-014: Implement monitoring and alerting
type MonitoringAlertingService struct {
	prometheusClient v1.API
	rules            []AlertRule
	activeAlerts     map[string]*MonitoringAlert
	mu               sync.RWMutex
	logger           *zap.Logger
	notificationFunc func(*MonitoringAlertNotification) error
}

// MonitoringConfig represents configuration for the monitoring service.
type MonitoringConfig struct {
	PrometheusURL string
	Logger        *zap.Logger
}

// NewMonitoringAlertingService creates a new monitoring and alerting service.
func NewMonitoringAlertingService(config MonitoringConfig) (*MonitoringAlertingService, error) {
	if config.Logger == nil {
		config.Logger = zap.NewNop()
	}

	var promClient v1.API
	if config.PrometheusURL != "" {
		client, err := api.NewClient(api.Config{
			Address: config.PrometheusURL,
		})
		if err != nil {
			return nil, fmt.Errorf("failed to create Prometheus client: %w", err)
		}
		promClient = v1.NewAPI(client)
	}

	return &MonitoringAlertingService{
		prometheusClient: promClient,
		rules:            make([]AlertRule, 0),
		activeAlerts:     make(map[string]*MonitoringAlert),
		logger:           config.Logger,
	}, nil
}

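// Example wiring (a minimal sketch; the Prometheus address is illustrative):
//
//	svc, err := NewMonitoringAlertingService(MonitoringConfig{
//		PrometheusURL: "http://localhost:9090",
//		Logger:        logger,
//	})
//	if err != nil {
//		log.Fatal(err)
//	}
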
// SetNotificationFunc sets the function to call when an alert fires.
func (s *MonitoringAlertingService) SetNotificationFunc(fn func(*MonitoringAlertNotification) error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.notificationFunc = fn
}

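// Example handler (a sketch; printing stands in for a real delivery client):
//
//	svc.SetNotificationFunc(func(n *MonitoringAlertNotification) error {
//		for _, ch := range n.Channels {
//			fmt.Printf("[%s] %s %s: %s\n", ch, n.Alert.Severity, n.Alert.RuleName, n.Alert.Message)
//		}
//		return nil
//	})
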
// AddAlertRule adds a new alert rule.
func (s *MonitoringAlertingService) AddAlertRule(rule AlertRule) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.rules = append(s.rules, rule)
	s.logger.Info("Alert rule added",
		zap.String("name", rule.Name),
		zap.String("severity", string(rule.Severity)),
	)
}

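// Example rule (the veza_http_requests_total metric name is hypothetical):
//
//	svc.AddAlertRule(AlertRule{
//		Name:        "high_request_rate",
//		Query:       "rate(veza_http_requests_total[5m])",
//		Threshold:   100,
//		Severity:    SeverityWarning,
//		Duration:    5 * time.Minute,
//		Description: "Request rate above expected baseline",
//		Enabled:     true,
//	})
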
// GetAlertRules returns all alert rules.
func (s *MonitoringAlertingService) GetAlertRules() []AlertRule {
	s.mu.RLock()
	defer s.mu.RUnlock()
	rules := make([]AlertRule, len(s.rules))
	copy(rules, s.rules)
	return rules
}

// GetActiveAlerts returns a snapshot of all tracked alerts. Copies are
// returned so callers do not race with in-place updates made by CheckAlerts.
func (s *MonitoringAlertingService) GetActiveAlerts() []*MonitoringAlert {
	s.mu.RLock()
	defer s.mu.RUnlock()
	alerts := make([]*MonitoringAlert, 0, len(s.activeAlerts))
	for _, alert := range s.activeAlerts {
		c := *alert
		alerts = append(alerts, &c)
	}
	return alerts
}

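// Inspecting current state (a sketch):
//
//	for _, a := range svc.GetActiveAlerts() {
//		fmt.Printf("%s [%s] value=%.2f threshold=%.2f\n", a.RuleName, a.Status, a.Value, a.Threshold)
//	}
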
// CheckAlerts evaluates all alert rules, fires alerts whose thresholds are
// exceeded, and resolves alerts whose values have dropped back below threshold.
func (s *MonitoringAlertingService) CheckAlerts(ctx context.Context) error {
	if s.prometheusClient == nil {
		return fmt.Errorf("Prometheus client not configured")
	}

	s.mu.RLock()
	rules := make([]AlertRule, len(s.rules))
	copy(rules, s.rules)
	notify := s.notificationFunc
	s.mu.RUnlock()

	for _, rule := range rules {
		if !rule.Enabled {
			continue
		}

		value, err := s.evaluateQuery(ctx, rule.Query)
		if err != nil {
			s.logger.Warn("Failed to evaluate alert rule",
				zap.String("rule", rule.Name),
				zap.Error(err),
			)
			continue
		}

		var notification *MonitoringAlertNotification

		alertKey := rule.Name
		s.mu.Lock()
		existingAlert, exists := s.activeAlerts[alertKey]
		if value >= rule.Threshold {
			// Threshold exceeded
			if !exists || existingAlert.Status == AlertStatusResolved {
				// New alert, or a previously resolved one firing again
				alert := &MonitoringAlert{
					RuleName:  rule.Name,
					Severity:  rule.Severity,
					Status:    AlertStatusFiring,
					Value:     value,
					Threshold: rule.Threshold,
					Message:   fmt.Sprintf("%s: value %.2f exceeds threshold %.2f", rule.Description, value, rule.Threshold),
					FiredAt:   time.Now(),
				}
				s.activeAlerts[alertKey] = alert
				if notify != nil {
					notification = &MonitoringAlertNotification{
						Alert:     alert,
						Channels:  []string{"email", "slack"},
						Timestamp: time.Now(),
					}
				}
				s.logger.Warn("Alert fired",
					zap.String("rule", rule.Name),
					zap.String("severity", string(rule.Severity)),
					zap.Float64("value", value),
					zap.Float64("threshold", rule.Threshold),
				)
			} else if exists && existingAlert.Status == AlertStatusFiring {
				// Alert still firing; record the latest observed value
				existingAlert.Value = value
			}
		} else {
			// Threshold not exceeded
			if exists && existingAlert.Status == AlertStatusFiring {
				// Alert resolved
				existingAlert.Status = AlertStatusResolved
				existingAlert.ResolvedAt = time.Now()
				existingAlert.Message = fmt.Sprintf("%s: value %.2f is below threshold %.2f", rule.Description, value, rule.Threshold)
				s.logger.Info("Alert resolved",
					zap.String("rule", rule.Name),
					zap.Float64("value", value),
					zap.Float64("threshold", rule.Threshold),
				)
			}
		}
		s.mu.Unlock()

		// Send the notification outside the lock so a callback that calls back
		// into this service cannot deadlock.
		if notification != nil {
			if err := notify(notification); err != nil {
				s.logger.Error("Failed to send alert notification",
					zap.String("rule", rule.Name),
					zap.Error(err),
				)
			}
		}
	}
	return nil
}

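// One-off evaluation (a sketch; normally driven by StartMonitoring):
//
//	if err := svc.CheckAlerts(context.Background()); err != nil {
//		log.Printf("alert check failed: %v", err)
//	}
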
// evaluateQuery evaluates an instant Prometheus query and returns the value
// of the first sample in the resulting vector.
func (s *MonitoringAlertingService) evaluateQuery(ctx context.Context, query string) (float64, error) {
	result, warnings, err := s.prometheusClient.Query(ctx, query, time.Now())
	if err != nil {
		return 0, fmt.Errorf("query failed: %w", err)
	}
	if len(warnings) > 0 {
		s.logger.Warn("Prometheus query warnings", zap.Strings("warnings", warnings))
	}

	vector, ok := result.(model.Vector)
	if !ok {
		return 0, fmt.Errorf("unexpected result type: %v", result.Type())
	}
	if len(vector) == 0 {
		return 0, fmt.Errorf("query returned no results")
	}

	// Return the first sample's value
	return float64(vector[0].Value), nil
}

// StartMonitoring starts continuous monitoring with periodic alert checks.
// It blocks until the context is cancelled.
func (s *MonitoringAlertingService) StartMonitoring(ctx context.Context, interval time.Duration) error {
	if interval <= 0 {
		interval = 30 * time.Second // Default interval
	}

	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	// Read the rule count under the lock to avoid a data race with AddAlertRule.
	s.mu.RLock()
	rulesCount := len(s.rules)
	s.mu.RUnlock()

	s.logger.Info("Starting monitoring",
		zap.Duration("interval", interval),
		zap.Int("rules_count", rulesCount),
	)

	for {
		select {
		case <-ctx.Done():
			s.logger.Info("Monitoring stopped")
			return ctx.Err()
		case <-ticker.C:
			if err := s.CheckAlerts(ctx); err != nil {
				s.logger.Error("Failed to check alerts", zap.Error(err))
			}
		}
	}
}

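// Typical usage (a sketch; run in its own goroutine and cancel the context to stop):
//
//	ctx, cancel := context.WithCancel(context.Background())
//	defer cancel()
//	go func() {
//		if err := svc.StartMonitoring(ctx, time.Minute); err != nil && !errors.Is(err, context.Canceled) {
//			log.Printf("monitoring exited: %v", err)
//		}
//	}()
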
// GetAlertByRuleName returns a copy of the alert tracked for the given rule name.
func (s *MonitoringAlertingService) GetAlertByRuleName(ruleName string) (*MonitoringAlert, bool) {
	s.mu.RLock()
	defer s.mu.RUnlock()
	alert, exists := s.activeAlerts[ruleName]
	if !exists {
		return nil, false
	}
	c := *alert // copy so callers don't race with in-place updates
	return &c, true
}

// ResolveAlert manually resolves an alert.
func (s *MonitoringAlertingService) ResolveAlert(ruleName string) error {
	s.mu.Lock()
	defer s.mu.Unlock()

	alert, exists := s.activeAlerts[ruleName]
	if !exists {
		return fmt.Errorf("alert not found: %s", ruleName)
	}
	if alert.Status == AlertStatusResolved {
		return nil // Already resolved
	}

	alert.Status = AlertStatusResolved
	alert.ResolvedAt = time.Now()
	alert.Message = "Manually resolved"
	s.logger.Info("Alert manually resolved",
		zap.String("rule", ruleName),
	)
	return nil
}

// GetDefaultAlertRules returns a set of default alert rules for common metrics.
// Each Query returns a raw value that the service compares against Threshold;
// the comparison must not be baked into the PromQL itself, since a query like
// "rate(...) > 0.1" yields an empty vector when healthy, which evaluateQuery
// reports as an error instead of a resolved alert.
func GetDefaultAlertRules() []AlertRule {
	return []AlertRule{
		{
			Name:        "high_error_rate",
			Query:       "rate(veza_errors_total[5m])",
			Threshold:   0.1,
			Severity:    SeverityCritical,
			Duration:    5 * time.Minute,
			Description: "High error rate detected",
			Enabled:     true,
		},
		{
			Name:        "high_response_time",
			Query:       "histogram_quantile(0.95, rate(veza_http_request_duration_seconds_bucket[5m]))",
			Threshold:   1.0,
			Severity:    SeverityWarning,
			Duration:    5 * time.Minute,
			Description: "High response time detected",
			Enabled:     true,
		},
		{
			Name:        "database_connection_pool_exhausted",
			Query:       "veza_database_connections_active / veza_database_connections_max",
			Threshold:   0.9,
			Severity:    SeverityCritical,
			Duration:    2 * time.Minute,
			Description: "Database connection pool nearly exhausted",
			Enabled:     true,
		},
		{
			Name:        "high_memory_usage",
			Query:       "process_resident_memory_bytes / 1024 / 1024 / 1024",
			Threshold:   2.0,
			Severity:    SeverityWarning,
			Duration:    5 * time.Minute,
			Description: "High memory usage detected",
			Enabled:     true,
		},
	}
}
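
// Wiring the defaults (a sketch):
//
//	for _, rule := range GetDefaultAlertRules() {
//		svc.AddAlertRule(rule)
//	}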