package services

import (
	"context"
	"fmt"
	"sync"
	"time"

	"github.com/prometheus/client_golang/api"
	v1 "github.com/prometheus/client_golang/api/prometheus/v1"
	"github.com/prometheus/common/model"
	"go.uber.org/zap"
)

// AlertSeverity represents the severity level of an alert
type AlertSeverity string

const (
	SeverityCritical AlertSeverity = "critical"
	SeverityWarning  AlertSeverity = "warning"
	SeverityInfo     AlertSeverity = "info"
)

// AlertStatus represents the status of an alert
type AlertStatus string

const (
	AlertStatusFiring   AlertStatus = "firing"
	AlertStatusResolved AlertStatus = "resolved"
	AlertStatusPending  AlertStatus = "pending"
)

// AlertRule represents a monitoring alert rule
type AlertRule struct {
	Name        string        `json:"name"`
	Query       string        `json:"query"`
	Threshold   float64       `json:"threshold"`
	Severity    AlertSeverity `json:"severity"`
	Duration    time.Duration `json:"duration"` // Duration before alert fires
	Description string        `json:"description"`
	Enabled     bool          `json:"enabled"`
}

// MonitoringAlert represents an active or resolved monitoring alert
type MonitoringAlert struct {
	RuleName   string        `json:"rule_name"`
	Severity   AlertSeverity `json:"severity"`
	Status     AlertStatus   `json:"status"`
	Value      float64       `json:"value"`
	Threshold  float64       `json:"threshold"`
	Message    string        `json:"message"`
	FiredAt    time.Time     `json:"fired_at,omitempty"`
	ResolvedAt time.Time     `json:"resolved_at,omitempty"`
}

// MonitoringAlertNotification represents a notification for a monitoring alert
type MonitoringAlertNotification struct {
	Alert     *MonitoringAlert       `json:"alert"`
	Channels  []string               `json:"channels"` // email, slack, webhook, etc.
	Metadata  map[string]interface{} `json:"metadata,omitempty"`
	Timestamp time.Time              `json:"timestamp"`
}

// MonitoringAlertingService provides monitoring and alerting capabilities
// BE-SVC-014: Implement monitoring and alerting
type MonitoringAlertingService struct {
	prometheusClient v1.API
	rules            []AlertRule
	activeAlerts     map[string]*MonitoringAlert
	mu               sync.RWMutex
	logger           *zap.Logger
	notificationFunc func(*MonitoringAlertNotification) error
}

// MonitoringConfig represents configuration for the monitoring service
type MonitoringConfig struct {
	PrometheusURL string
	Logger        *zap.Logger
}

// NewMonitoringAlertingService creates a new monitoring and alerting service
func NewMonitoringAlertingService(config MonitoringConfig) (*MonitoringAlertingService, error) {
	if config.Logger == nil {
		config.Logger = zap.NewNop()
	}

	var promClient v1.API
	if config.PrometheusURL != "" {
		client, err := api.NewClient(api.Config{
			Address: config.PrometheusURL,
		})
		if err != nil {
			return nil, fmt.Errorf("failed to create Prometheus client: %w", err)
		}
		promClient = v1.NewAPI(client)
	}

	return &MonitoringAlertingService{
		prometheusClient: promClient,
		rules:            make([]AlertRule, 0),
		activeAlerts:     make(map[string]*MonitoringAlert),
		logger:           config.Logger,
	}, nil
}

// SetNotificationFunc sets the function to call when an alert fires
func (s *MonitoringAlertingService) SetNotificationFunc(fn func(*MonitoringAlertNotification) error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.notificationFunc = fn
}
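
// newLoggingNotificationFunc is an illustrative sketch, not part of the
// original file: it builds a handler compatible with SetNotificationFunc that
// simply logs each notification. A real handler would fan the alert out to
// the channels listed in n.Channels (email, Slack, webhooks, ...).
func newLoggingNotificationFunc(logger *zap.Logger) func(*MonitoringAlertNotification) error {
	return func(n *MonitoringAlertNotification) error {
		logger.Info("alert notification",
			zap.String("rule", n.Alert.RuleName),
			zap.String("severity", string(n.Alert.Severity)),
			zap.String("status", string(n.Alert.Status)),
			zap.Strings("channels", n.Channels),
		)
		return nil
	}
}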

// AddAlertRule adds a new alert rule
func (s *MonitoringAlertingService) AddAlertRule(rule AlertRule) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.rules = append(s.rules, rule)
	s.logger.Info("Alert rule added",
		zap.String("name", rule.Name),
		zap.String("severity", string(rule.Severity)),
	)
}

// GetAlertRules returns all alert rules
func (s *MonitoringAlertingService) GetAlertRules() []AlertRule {
	s.mu.RLock()
	defer s.mu.RUnlock()
	rules := make([]AlertRule, len(s.rules))
	copy(rules, s.rules)
	return rules
}

// GetActiveAlerts returns all active alerts
func (s *MonitoringAlertingService) GetActiveAlerts() []*MonitoringAlert {
	s.mu.RLock()
	defer s.mu.RUnlock()
	alerts := make([]*MonitoringAlert, 0, len(s.activeAlerts))
	for _, alert := range s.activeAlerts {
		alerts = append(alerts, alert)
	}
	return alerts
}

// CheckAlerts evaluates all alert rules and triggers alerts if thresholds are exceeded
func (s *MonitoringAlertingService) CheckAlerts(ctx context.Context) error {
	if s.prometheusClient == nil {
		return fmt.Errorf("Prometheus client not configured")
	}

	s.mu.RLock()
	rules := make([]AlertRule, len(s.rules))
	copy(rules, s.rules)
	s.mu.RUnlock()

	for _, rule := range rules {
		if !rule.Enabled {
			continue
		}

		value, err := s.evaluateQuery(ctx, rule.Query)
		if err != nil {
			s.logger.Warn("Failed to evaluate alert rule",
				zap.String("rule", rule.Name),
				zap.Error(err),
			)
			continue
		}

		alertKey := rule.Name
		s.mu.Lock()
		existingAlert, exists := s.activeAlerts[alertKey]

		if value >= rule.Threshold {
			// Threshold exceeded
			if !exists || existingAlert.Status == AlertStatusResolved {
				// New alert or previously resolved
				alert := &MonitoringAlert{
					RuleName:  rule.Name,
					Severity:  rule.Severity,
					Status:    AlertStatusFiring,
					Value:     value,
					Threshold: rule.Threshold,
					Message:   fmt.Sprintf("%s: value %.2f exceeds threshold %.2f", rule.Description, value, rule.Threshold),
					FiredAt:   time.Now(),
				}
				s.activeAlerts[alertKey] = alert

				// Send notification
				if s.notificationFunc != nil {
					notification := &MonitoringAlertNotification{
						Alert:     alert,
						Channels:  []string{"email", "slack"},
						Timestamp: time.Now(),
					}
					if err := s.notificationFunc(notification); err != nil {
						s.logger.Error("Failed to send alert notification",
							zap.String("rule", rule.Name),
							zap.Error(err),
						)
					}
				}

				s.logger.Warn("Alert fired",
					zap.String("rule", rule.Name),
					zap.String("severity", string(rule.Severity)),
					zap.Float64("value", value),
					zap.Float64("threshold", rule.Threshold),
				)
			} else if exists && existingAlert.Status == AlertStatusFiring {
				// Alert still firing, update value
				existingAlert.Value = value
			}
		} else {
			// Threshold not exceeded
			if exists && existingAlert.Status == AlertStatusFiring {
				// Alert resolved
				existingAlert.Status = AlertStatusResolved
				existingAlert.ResolvedAt = time.Now()
				existingAlert.Message = fmt.Sprintf("%s: value %.2f is below threshold %.2f", rule.Description, value, rule.Threshold)

				s.logger.Info("Alert resolved",
					zap.String("rule", rule.Name),
					zap.Float64("value", value),
					zap.Float64("threshold", rule.Threshold),
				)
			}
		}
		s.mu.Unlock()
	}

	return nil
}

// evaluateQuery evaluates a Prometheus query and returns the value
func (s *MonitoringAlertingService) evaluateQuery(ctx context.Context, query string) (float64, error) {
	result, warnings, err := s.prometheusClient.Query(ctx, query, time.Now())
	if err != nil {
		return 0, fmt.Errorf("query failed: %w", err)
	}

	if len(warnings) > 0 {
		s.logger.Warn("Prometheus query warnings", zap.Strings("warnings", warnings))
	}

	if result.Type() != model.ValVector {
		return 0, fmt.Errorf("unexpected result type: %v", result.Type())
	}

	vector := result.(model.Vector)
	if len(vector) == 0 {
		return 0, fmt.Errorf("query returned no results")
	}

	// Return the first value
	return float64(vector[0].Value), nil
}

// StartMonitoring starts continuous monitoring with periodic alert checks
func (s *MonitoringAlertingService) StartMonitoring(ctx context.Context, interval time.Duration) error {
	if interval <= 0 {
		interval = 30 * time.Second // Default interval
	}

	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	// Read the rule count under the read lock; rules may be added concurrently.
	s.mu.RLock()
	rulesCount := len(s.rules)
	s.mu.RUnlock()

	s.logger.Info("Starting monitoring",
		zap.Duration("interval", interval),
		zap.Int("rules_count", rulesCount),
	)

	for {
		select {
		case <-ctx.Done():
			s.logger.Info("Monitoring stopped")
			return ctx.Err()
		case <-ticker.C:
			if err := s.CheckAlerts(ctx); err != nil {
				s.logger.Error("Failed to check alerts", zap.Error(err))
			}
		}
	}
}

// GetAlertByRuleName returns an alert by rule name
func (s *MonitoringAlertingService) GetAlertByRuleName(ruleName string) (*MonitoringAlert, bool) {
	s.mu.RLock()
	defer s.mu.RUnlock()
	alert, exists := s.activeAlerts[ruleName]
	return alert, exists
}

// ResolveAlert manually resolves an alert
func (s *MonitoringAlertingService) ResolveAlert(ruleName string) error {
	s.mu.Lock()
	defer s.mu.Unlock()

	alert, exists := s.activeAlerts[ruleName]
	if !exists {
		return fmt.Errorf("alert not found: %s", ruleName)
	}

	if alert.Status == AlertStatusResolved {
		return nil // Already resolved
	}

	alert.Status = AlertStatusResolved
	alert.ResolvedAt = time.Now()
	alert.Message = "Manually resolved"

	s.logger.Info("Alert manually resolved",
		zap.String("rule", ruleName),
	)

	return nil
}
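
// inspectAndResolveExample is an illustrative sketch, not part of the original
// file: it lists the currently active alerts and manually resolves one by rule
// name ("high_error_rate" is one of the default rules defined below).
func inspectAndResolveExample(svc *MonitoringAlertingService) {
	for _, alert := range svc.GetActiveAlerts() {
		fmt.Printf("[%s] %s: %s\n", alert.Severity, alert.RuleName, alert.Message)
	}
	if _, ok := svc.GetAlertByRuleName("high_error_rate"); ok {
		_ = svc.ResolveAlert("high_error_rate")
	}
}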

// GetDefaultAlertRules returns a set of default alert rules for common metrics.
// The queries return raw values; the comparison against Threshold is applied by
// CheckAlerts. Embedding the comparison in the query itself would yield an empty
// vector whenever the condition is false, which evaluateQuery reports as an
// error and which would prevent a firing alert from ever resolving.
func GetDefaultAlertRules() []AlertRule {
	return []AlertRule{
		{
			Name:        "high_error_rate",
			Query:       "rate(veza_errors_total[5m])",
			Threshold:   0.1,
			Severity:    SeverityCritical,
			Duration:    5 * time.Minute,
			Description: "High error rate detected",
			Enabled:     true,
		},
		{
			Name:        "high_response_time",
			Query:       "histogram_quantile(0.95, rate(veza_http_request_duration_seconds_bucket[5m]))",
			Threshold:   1.0,
			Severity:    SeverityWarning,
			Duration:    5 * time.Minute,
			Description: "High response time detected",
			Enabled:     true,
		},
		{
			Name:        "database_connection_pool_exhausted",
			Query:       "veza_database_connections_active / veza_database_connections_max",
			Threshold:   0.9,
			Severity:    SeverityCritical,
			Duration:    2 * time.Minute,
			Description: "Database connection pool nearly exhausted",
			Enabled:     true,
		},
		{
			Name:        "high_memory_usage",
			Query:       "process_resident_memory_bytes / 1024 / 1024 / 1024",
			Threshold:   2.0,
			Severity:    SeverityWarning,
			Duration:    5 * time.Minute,
			Description: "High memory usage detected",
			Enabled:     true,
		},
	}
}
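
// runMonitoringExample is an illustrative sketch, not part of the original
// file: it shows how the pieces above are typically wired together. The
// Prometheus address is a placeholder; the call blocks until ctx is cancelled.
func runMonitoringExample(ctx context.Context, logger *zap.Logger) error {
	svc, err := NewMonitoringAlertingService(MonitoringConfig{
		PrometheusURL: "http://localhost:9090", // placeholder address
		Logger:        logger,
	})
	if err != nil {
		return err
	}

	// Register the default rules and attach a notification handler.
	for _, rule := range GetDefaultAlertRules() {
		svc.AddAlertRule(rule)
	}
	svc.SetNotificationFunc(newLoggingNotificationFunc(logger))

	// Evaluate all enabled rules every 30 seconds until ctx is cancelled.
	return svc.StartMonitoring(ctx, 30*time.Second)
}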