// veza/veza-backend-api/internal/utils/sanitizer.go

package utils

import (
	"html"
	"regexp"
	"strings"
	"unicode"
	"unicode/utf8"
)

// BE-SEC-009: Input sanitization to prevent XSS and injection attacks

// dangerousSchemeSuffixRe matches, case-insensitively, one of the URL schemes
// stripped by SanitizeInput and SanitizeText when that scheme ends the text
// being examined. It is compiled once at package initialization instead of on
// every call.
var dangerousSchemeSuffixRe = regexp.MustCompile(`(?i)(?:javascript|data|vbscript|file|about):$`)

// stripDangerousSchemes removes every "javascript:", "data:", "vbscript:",
// "file:" and "about:" occurrence (in any letter case) from s.
//
// Deleting matches in one left-to-right regexp pass is not enough:
// "javajavascript:script:" collapses back into "javascript:" once the inner
// match is gone. The text is therefore rebuilt byte by byte and, after each
// ':', a scheme sitting at the end of the rebuilt output is popped off. The
// result never contains one of the schemes and the work stays linear in
// len(s), unlike a "replace until nothing changes" loop.
func stripDangerousSchemes(s string) string {
	if strings.IndexByte(s, ':') < 0 {
		return s
	}

	// The longest possible match is 12 bytes ("javascript:" with the 's'
	// written as U+017F, which (?i) folds to 's'); 16 leaves some slack.
	const window = 16

	out := make([]byte, 0, len(s))
	for i := 0; i < len(s); i++ {
		out = append(out, s[i])
		if s[i] != ':' {
			continue
		}
		start := len(out) - window
		if start < 0 {
			start = 0
		}
		if loc := dangerousSchemeSuffixRe.FindIndex(out[start:]); loc != nil {
			out = out[:start+loc[0]]
		}
	}
	return string(out)
}

// truncateEscaped shortens s to at most limit bytes. The cut is moved back so
// that it never splits a multi-byte UTF-8 sequence or one of the entities
// produced by html.EscapeString (which would leave a dangling "&am"-style
// tail in the output).
func truncateEscaped(s string, limit int) string {
	if len(s) <= limit {
		return s
	}

	cut := limit
	for cut > 0 && !utf8.RuneStart(s[cut]) {
		cut--
	}
	s = s[:cut]

	// In escaped text every '&' opens an entity, so a final '&' with no ';'
	// after it is an entity that the cut chopped in half.
	if amp := strings.LastIndexByte(s, '&'); amp >= 0 && strings.IndexByte(s[amp:], ';') < 0 {
		s = s[:amp]
	}
	return s
}

// SanitizeInput sanitizes user input to prevent XSS and injection attacks.
//
// It performs the following operations, in this order:
//  1. HTML-escape special characters.
//  2. Remove control characters (except newline, carriage return and tab).
//  3. Remove dangerous URL schemes (javascript:, data:, vbscript:, file:,
//     about:), including ones that only appear after an inner match or a
//     control character has been removed.
//  4. Trim leading and trailing whitespace.
//  5. Limit the result to maxLength bytes without splitting a UTF-8 sequence
//     or an escaped entity.
//
// A maxLength <= 0 selects the default limit of 10000 bytes. An empty input
// yields an empty string.
func SanitizeInput(input string, maxLength int) string {
	if input == "" {
		return ""
	}
	if maxLength <= 0 {
		maxLength = 10000
	}

	// Step 1: HTML escape to prevent XSS.
	cleaned := html.EscapeString(input)

	// Step 2: drop control characters. This runs before the scheme check so
	// that "java\x00script:" cannot slip past it and turn into "javascript:"
	// afterwards.
	cleaned = strings.Map(func(r rune) rune {
		switch r {
		case '\n', '\r', '\t':
			return r
		}
		if unicode.IsControl(r) {
			return -1
		}
		return r
	}, cleaned)

	// Step 3: remove dangerous URL schemes.
	cleaned = stripDangerousSchemes(cleaned)

	// Step 4: trim whitespace.
	cleaned = strings.TrimSpace(cleaned)

	// Step 5: limit length.
	return truncateEscaped(cleaned, maxLength)
}

// SanitizeText sanitizes text input (usernames, titles, descriptions, etc.).
//
// It is more permissive than SanitizeInput: of the control characters only
// NUL and SUB (0x1A) are removed. The text is still HTML-escaped, dangerous
// URL schemes are stripped, surrounding whitespace is trimmed and the result
// is limited to maxLength bytes without splitting a UTF-8 sequence or an
// escaped entity.
//
// A maxLength <= 0 selects the default limit of 5000 bytes. An empty input
// yields an empty string.
func SanitizeText(input string, maxLength int) string {
	if input == "" {
		return ""
	}
	if maxLength <= 0 {
		maxLength = 5000
	}

	// HTML escape to prevent XSS.
	cleaned := html.EscapeString(input)

	// Remove NUL and SUB before looking for schemes, otherwise
	// "java\x00script:" would only become "javascript:" after the check.
	cleaned = strings.ReplaceAll(cleaned, "\x00", "")
	cleaned = strings.ReplaceAll(cleaned, "\x1a", "")

	// Remove dangerous URL schemes.
	cleaned = stripDangerousSchemes(cleaned)

	// Trim whitespace.
	cleaned = strings.TrimSpace(cleaned)

	// Limit length.
	return truncateEscaped(cleaned, maxLength)
}

// sanitizeHTMLSchemes is the regexp fragment for the URL schemes SanitizeHTML
// refuses inside href/src attributes.
const sanitizeHTMLSchemes = `(?:javascript|data|vbscript|file|about):`

// sanitizeHTMLMaxPasses bounds how often SanitizeHTML re-applies its rules
// before it gives up and escapes the remaining markup.
const sanitizeHTMLMaxPasses = 10

// sanitizeHTMLRules are applied in order by SanitizeHTML; every match is
// deleted. They are compiled once at package initialization. The (?s) flag
// lets '.' cross newlines so multi-line elements are matched as well.
var sanitizeHTMLRules = []*regexp.Regexp{
	// <script> elements together with their content.
	regexp.MustCompile(`(?is)<script[^>]*>.*?</script\b[^>]*>`),
	// <iframe> elements.
	regexp.MustCompile(`(?is)<iframe[^>]*>.*?</iframe\b[^>]*>`),
	// <object> and <embed> elements. Go's RE2 engine has no backreferences
	// (a `</\1>` pattern panics in MustCompile), so each tag is spelled out.
	regexp.MustCompile(`(?is)<object[^>]*>.*?</object\b[^>]*>`),
	regexp.MustCompile(`(?is)<embed[^>]*>.*?</embed\b[^>]*>`),
	// Event handler attributes (onclick, onerror, ...) with a double-quoted,
	// single-quoted or unquoted value. The \b keeps attributes that merely
	// contain "on", such as content="...", intact.
	regexp.MustCompile(`(?i)\s*\bon\w+\s*=\s*(?:"[^"]*"|'[^']*'|[^\s>]+)`),
	// href/src attributes whose value starts with a dangerous URL scheme.
	regexp.MustCompile(`(?i)(?:href|src)\s*=\s*(?:"\s*` + sanitizeHTMLSchemes + `[^"]*"|'\s*` + sanitizeHTMLSchemes + `[^']*'|` + sanitizeHTMLSchemes + `[^\s>]*)`),
	// <style> elements.
	regexp.MustCompile(`(?is)<style[^>]*>.*?</style\b[^>]*>`),
	// Leftover opening or closing tags of the elements above, e.g. a void
	// <embed ...> or a <script> that is never closed.
	regexp.MustCompile(`(?i)</?(?:script|iframe|object|embed|style)\b[^>]*>`),
}

// SanitizeHTML sanitizes HTML content by removing dangerous tags and
// attributes: script, iframe, object, embed and style elements, on* event
// handler attributes, and href/src attributes that use a javascript:, data:,
// vbscript:, file: or about: URL.
//
// The rules are re-applied until a pass removes nothing, because deleting one
// match can splice a new one together ("<scr<script></script>ipt>"). If the
// input still changes after sanitizeHTMLMaxPasses passes it is treated as
// hostile and what is left is HTML-escaped instead of being returned as
// markup.
//
// The result is limited to maxLength bytes (default 50000 when
// maxLength <= 0), cut on a UTF-8 boundary. The cut may still fall inside a
// tag.
//
// NOTE(review): this is pattern-based, best-effort filtering, not an HTML
// parser. It does not cover every URL-bearing attribute or malformed markup;
// content that is rendered as HTML for other users should additionally go
// through a parser-based, allow-list sanitizer.
func SanitizeHTML(input string, maxLength int) string {
	if input == "" {
		return ""
	}
	if maxLength <= 0 {
		maxLength = 50000
	}

	cleaned := input
	for pass := 1; ; pass++ {
		before := len(cleaned)
		for _, rule := range sanitizeHTMLRules {
			cleaned = rule.ReplaceAllString(cleaned, "")
		}
		// Rules only delete, so an unchanged length means nothing matched.
		if len(cleaned) == before {
			break
		}
		if pass >= sanitizeHTMLMaxPasses {
			// Fail closed: deeply nested input keeps reassembling itself.
			cleaned = html.EscapeString(cleaned)
			break
		}
	}

	// Limit length without splitting a multi-byte UTF-8 sequence.
	if len(cleaned) > maxLength {
		cut := maxLength
		for cut > 0 && !utf8.RuneStart(cleaned[cut]) {
			cut--
		}
		cleaned = cleaned[:cut]
	}

	return cleaned
}

// sanitizeURLSchemeRe matches a dangerous URL scheme at the very start of a
// string, case-insensitively. It is compiled once at package initialization.
var sanitizeURLSchemeRe = regexp.MustCompile(`(?i)^(javascript|data|vbscript|file|about):`)

// SanitizeURL sanitizes a URL to prevent XSS and injection.
//
// ASCII control characters (NUL, tab, CR, LF, ...) are removed first, so a
// scheme cannot be hidden as "java\tscript:" or "java\x00script:". Leading
// javascript:, data:, vbscript:, file: and about: schemes are then stripped
// repeatedly, trimming whitespace in between, so stacked prefixes such as
// "javascript:javascript:..." do not leave a live scheme behind. The result
// is limited to 2048 bytes, cut on a UTF-8 boundary.
//
// The function does not validate that the result is a well-formed URL.
func SanitizeURL(input string) string {
	if input == "" {
		return ""
	}

	// Filter byte-wise: every ASCII control character is a single byte below
	// 0x20 (or 0x7f), so multi-byte UTF-8 sequences pass through untouched.
	var b strings.Builder
	b.Grow(len(input))
	for i := 0; i < len(input); i++ {
		if c := input[i]; c >= 0x20 && c != 0x7f {
			b.WriteByte(c)
		}
	}
	cleaned := b.String()

	// Strip leading schemes until none is left. Each round only re-slices
	// the string, so a long run of stacked schemes stays cheap.
	for {
		cleaned = strings.TrimSpace(cleaned)
		scheme := sanitizeURLSchemeRe.FindString(cleaned)
		if scheme == "" {
			break
		}
		cleaned = cleaned[len(scheme):]
	}

	// Limit length without splitting a multi-byte UTF-8 sequence.
	if len(cleaned) > 2048 {
		cut := 2048
		for cut > 0 && !utf8.RuneStart(cleaned[cut]) {
			cut--
		}
		cleaned = cleaned[:cut]
	}

	return cleaned
}

// SanitizeEmail sanitizes an email address: it lowercases the input, removes
// all control characters, trims surrounding whitespace and limits the result
// to 320 bytes, cut on a UTF-8 boundary.
//
// It does not validate that the result is a well-formed address.
func SanitizeEmail(input string) string {
	if input == "" {
		return ""
	}

	cleaned := strings.ToLower(input)

	// Remove control characters.
	cleaned = strings.Map(func(r rune) rune {
		if unicode.IsControl(r) {
			return -1
		}
		return r
	}, cleaned)

	// Trim only after the control characters are gone; otherwise whitespace
	// that sat next to one (e.g. "\x00 a@b.com") would survive at the edge.
	cleaned = strings.TrimSpace(cleaned)

	// Limit length: 320 = 64 (local part) + 1 ('@') + 255 (domain), the
	// component limits from RFC 5321. Back up to a rune boundary so a
	// multi-byte character is never cut in half.
	if len(cleaned) > 320 {
		cut := 320
		for cut > 0 && !utf8.RuneStart(cleaned[cut]) {
			cut--
		}
		cleaned = cleaned[:cut]
	}

	return cleaned
}

// sanitizeUsernameTagRe matches anything that looks like an HTML tag: a '<',
// any run of characters other than '>', and the closing '>'. It is compiled
// once at package initialization.
var sanitizeUsernameTagRe = regexp.MustCompile(`<[^>]*>`)

// SanitizeUsername sanitizes a username: it removes HTML tags and control
// characters, trims surrounding whitespace and limits the result to 50 bytes,
// cut on a UTF-8 boundary.
//
// Only complete "<...>" sequences are removed; a '<' that is never followed
// by '>' is left in place.
func SanitizeUsername(input string) string {
	if input == "" {
		return ""
	}

	// Remove HTML tags.
	cleaned := sanitizeUsernameTagRe.ReplaceAllString(input, "")

	// Remove control characters.
	cleaned = strings.Map(func(r rune) rune {
		if unicode.IsControl(r) {
			return -1
		}
		return r
	}, cleaned)

	// Trim last, so whitespace exposed by the removals above
	// ("<b> bob </b>") does not end up at the edges of the name.
	cleaned = strings.TrimSpace(cleaned)

	// Limit length without splitting a multi-byte UTF-8 sequence.
	if len(cleaned) > 50 {
		cut := 50
		for cut > 0 && !utf8.RuneStart(cleaned[cut]) {
			cut--
		}
		cleaned = cleaned[:cut]
	}

	return cleaned
}