// talas-group/talas-wiki/internal/wiki/indexer.go

package wiki
import (
"bufio"
"os"
"path/filepath"
"regexp"
"sort"
"strings"
"sync"
"time"

"github.com/fsnotify/fsnotify"
)
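// Page is one indexed markdown file; URLPath is RelPath without the .md extension.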
type Page struct {
RelPath string // e.g. "01_PILOTAGE/CALENDRIER_GENERAL.md"
URLPath string // e.g. "01_PILOTAGE/CALENDRIER_GENERAL"
AbsPath string
Title string
Domain string // e.g. "01_PILOTAGE"
Name string // e.g. "CALENDRIER_GENERAL"
Tags []string
ModTime time.Time
Size int64
}
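// DirEntry is one row in a directory listing produced by ListDir.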
type DirEntry struct {
Name string
URLPath string
IsDir bool
ModTime time.Time
Size int64
}
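// Domain aggregates the pages under one top-level numbered directory (e.g. 01_PILOTAGE).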
type Domain struct {
Number string
Name string
FullDir string
Desc string
Count int
Color string
}
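// DomainColors maps each two-digit domain prefix to a Nord-palette accent
// color, cycling through seven hues.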
var DomainColors = map[string]string{
"00": "#bf616a", "01": "#d08770", "02": "#ebcb8b", "03": "#a3be8c",
"04": "#88c0d0", "05": "#5e81ac", "06": "#b48ead", "07": "#bf616a",
"08": "#d08770", "09": "#ebcb8b", "10": "#a3be8c", "11": "#88c0d0",
"12": "#5e81ac",
}
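// BrokenLink records a wikilink whose target could not be resolved to a page.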
type BrokenLink struct {
Source string
Target string
Suggestion string // best fuzzy match URL path
}
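// GraphNode, GraphLink and GraphData form the JSON payload served to the
// wiki's link-graph view.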
type GraphNode struct {
ID string `json:"id"`
Title string `json:"title"`
Domain string `json:"domain"`
Size int `json:"size"` // number of links (incoming + outgoing)
}
type GraphLink struct {
Source string `json:"source"`
Target string `json:"target"`
}
type GraphData struct {
Nodes []GraphNode `json:"nodes"`
Links []GraphLink `json:"links"`
}
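// DomainHealth summarizes per-domain quality metrics; the Pct* fields are
// percentages (0-100) of pages that are linked, tagged, or modified within
// the last 30 days.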
type DomainHealth struct {
Domain string
Color string
PctLinked int
PctTagged int
PctRecent int
AvgLinks float64
TotalPages int
}
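// Index is the in-memory index of the wiki. Rebuild replaces all maps
// wholesale under the write lock, so readers always see a consistent snapshot.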
type Index struct {
mu sync.RWMutex
docsRoot string
excludes map[string]bool
byFullPath map[string]*Page
byName map[string][]*Page
byTag map[string][]*Page
backlinks map[string][]string
outlinks map[string][]string
contentCache map[string]string
brokenLinks []BrokenLink
domains []Domain
allPages []*Page
allTags []string
// Precomputed analytics (computed async after Rebuild)
cachedLinkSuggestions []LinkSuggestion
cachedDuplicates []DuplicateGroup
cachedDomainHealth []DomainHealth
analyticsReady bool
}
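// wikilinkRe captures the body of [[...]] wikilinks; an optional |label is
// stripped by the caller.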
var wikilinkRe = regexp.MustCompile(`\[\[([^\]]+)\]\]`)
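// NewIndex builds the index for docsRoot, skipping excludeDirs, and performs
// an initial synchronous Rebuild.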
func NewIndex(docsRoot string, excludeDirs []string) *Index {
excludes := make(map[string]bool, len(excludeDirs))
for _, d := range excludeDirs {
excludes[d] = true
}
idx := &Index{
docsRoot: docsRoot,
excludes: excludes,
}
idx.Rebuild()
return idx
}
func (idx *Index) GetDocsRoot() string {
return idx.docsRoot
}
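// isExcluded reports whether relPath (slash-separated, relative to docsRoot)
// is inside an excluded directory or any dot-directory.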
func (idx *Index) isExcluded(relPath string) bool {
for exc := range idx.excludes {
if relPath == exc || strings.HasPrefix(relPath, exc+"/") {
return true
}
}
parts := strings.Split(relPath, "/")
for _, p := range parts {
if strings.HasPrefix(p, ".") {
return true
}
}
return false
}
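// Rebuild rescans docsRoot from scratch: it walks all markdown files, parses
// titles and tags, resolves wikilinks into backlink/outlink maps, collects
// broken links with fuzzy suggestions, then swaps the new state in under the
// write lock. Heavy analytics are recomputed asynchronously afterwards.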
func (idx *Index) Rebuild() {
byFullPath := make(map[string]*Page)
byName := make(map[string][]*Page)
byTag := make(map[string][]*Page)
backlinks := make(map[string][]string)
outlinks := make(map[string][]string)
var brokenLinks []BrokenLink
var allPages []*Page
domainMap := make(map[string]*Domain)
tagSet := make(map[string]bool)
filepath.Walk(idx.docsRoot, func(path string, info os.FileInfo, err error) error {
if err != nil {
return nil
}
relPath, _ := filepath.Rel(idx.docsRoot, path)
relPath = filepath.ToSlash(relPath) // normalize so exclusion checks and URL paths use "/" on every OS
if relPath == "." {
return nil
}
if idx.isExcluded(relPath) {
if info.IsDir() {
return filepath.SkipDir
}
return nil
}
if info.IsDir() || !strings.HasSuffix(relPath, ".md") {
return nil
}
urlPath := strings.TrimSuffix(relPath, ".md")
name := strings.TrimSuffix(filepath.Base(relPath), ".md")
domain := ""
parts := strings.SplitN(relPath, "/", 2)
if len(parts) > 1 {
domain = parts[0]
}
title, tags := extractTitleAndTags(path, name)
page := &Page{
RelPath: relPath,
URLPath: urlPath,
AbsPath: path,
Title: title,
Domain: domain,
Name: name,
Tags: tags,
ModTime: info.ModTime(),
Size: info.Size(),
}
byFullPath[urlPath] = page
byName[name] = append(byName[name], page)
allPages = append(allPages, page)
for _, tag := range tags {
tagLower := strings.ToLower(tag)
byTag[tagLower] = append(byTag[tagLower], page)
tagSet[tagLower] = true
}
if domain != "" {
if _, ok := domainMap[domain]; !ok {
num := ""
dname := domain
if i := strings.Index(domain, "_"); i >= 0 {
num = domain[:i]
dname = domain[i+1:]
}
domainMap[domain] = &Domain{
Number: num,
Name: dname,
FullDir: domain,
Color: DomainColors[num],
}
}
domainMap[domain].Count++
}
return nil
})
// Build link maps + content cache
contentCache := make(map[string]string, len(allPages))
for _, page := range allPages {
content, err := os.ReadFile(page.AbsPath)
if err != nil {
continue
}
text := string(content)
contentCache[page.URLPath] = strings.ToLower(text)
matches := wikilinkRe.FindAllStringSubmatch(text, -1)
seen := make(map[string]bool)
for _, m := range matches {
target := m[1]
if pipeIdx := strings.Index(target, "|"); pipeIdx >= 0 {
target = target[:pipeIdx]
}
target = strings.TrimSpace(target)
if target == "" || strings.HasPrefix(target, "!") {
continue
}
resolved := resolveTarget(target, byFullPath, byName)
if resolved != "" {
if !seen[resolved] {
backlinks[resolved] = append(backlinks[resolved], page.URLPath)
outlinks[page.URLPath] = append(outlinks[page.URLPath], resolved)
seen[resolved] = true
}
} else if !isAssetTarget(target) {
brokenLinks = append(brokenLinks, BrokenLink{
Source: page.URLPath,
Target: target,
})
}
}
}
// Domain descriptions
for key, dom := range domainMap {
readmePath := filepath.Join(idx.docsRoot, dom.FullDir, "README.md")
dom.Desc = extractFirstLine(readmePath)
domainMap[key] = dom
}
var domains []Domain
for _, dom := range domainMap {
domains = append(domains, *dom)
}
sort.Slice(domains, func(i, j int) bool {
return domains[i].Number < domains[j].Number
})
var allTags []string
for tag := range tagSet {
allTags = append(allTags, tag)
}
sort.Strings(allTags)
// Compute broken link suggestions using fuzzy matching
for i := range brokenLinks {
bestScore := 0
bestMatch := ""
targetLower := strings.ToLower(brokenLinks[i].Target)
for _, page := range allPages {
nameLower := strings.ToLower(page.Name)
score := 0
if strings.Contains(nameLower, targetLower) || strings.Contains(targetLower, nameLower) {
score = 10
} else {
triA := makeTrigrams(targetLower)
triB := makeTrigrams(nameLower)
common := 0
for t := range triA {
if triB[t] {
common++
}
}
total := len(triA)
if len(triB) > total {
total = len(triB)
}
if total > 0 && float64(common)/float64(total) > 0.3 {
score = common
}
}
if score > bestScore {
bestScore = score
bestMatch = page.URLPath
}
}
brokenLinks[i].Suggestion = bestMatch
}
idx.mu.Lock()
idx.byFullPath = byFullPath
idx.byName = byName
idx.byTag = byTag
idx.backlinks = backlinks
idx.outlinks = outlinks
idx.contentCache = contentCache
idx.brokenLinks = brokenLinks
idx.domains = domains
idx.allPages = allPages
idx.allTags = allTags
idx.analyticsReady = false
idx.mu.Unlock()
// Compute heavy analytics asynchronously
go idx.computeAnalytics()
}
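// isAssetTarget reports whether a wikilink target points at a binary or
// static asset rather than a markdown page, so it is not flagged as broken.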
func isAssetTarget(target string) bool {
exts := []string{".pdf", ".png", ".jpg", ".jpeg", ".gif", ".svg", ".ods", ".xlsx", ".zip", ".mp3", ".wav", ".html"}
lower := strings.ToLower(target)
for _, ext := range exts {
if strings.HasSuffix(lower, ext) {
return true
}
}
return false
}
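// resolveTarget maps a wikilink target to a page URL path: exact full-path
// match first, then bare-name lookup (the first page wins on name collisions).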
func resolveTarget(target string, byFullPath map[string]*Page, byName map[string][]*Page) string {
target = strings.TrimSuffix(target, ".md")
if p, ok := byFullPath[target]; ok {
return p.URLPath
}
name := filepath.Base(target)
if pages, ok := byName[name]; ok && len(pages) > 0 {
return pages[0].URLPath
}
return ""
}
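// extractTitleAndTags scans the top of a file: Logseq property lines are
// skipped, "tags::" values are collected, and the first "# " heading becomes
// the title. Scanning stops at the first regular content line; the fallback
// (the file name) is used when no heading is found.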
func extractTitleAndTags(path string, fallback string) (string, []string) {
f, err := os.Open(path)
if err != nil {
return fallback, nil
}
defer f.Close()
title := fallback
var tags []string
scanner := bufio.NewScanner(f)
foundTitle := false
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
// Extract tags from Logseq properties
if strings.HasPrefix(line, "tags::") {
tagStr := strings.TrimPrefix(line, "tags::")
tagStr = strings.TrimSpace(tagStr)
for _, t := range strings.Split(tagStr, ",") {
t = strings.TrimSpace(t)
if t != "" {
tags = append(tags, t)
}
}
continue
}
// Skip other Logseq properties
if strings.Contains(line, ":: ") && !strings.HasPrefix(line, "#") {
continue
}
if !foundTitle {
if strings.HasPrefix(line, "# ") {
title = strings.TrimPrefix(line, "# ")
foundTitle = true
continue
}
if line != "" && !strings.HasPrefix(line, "---") {
break
}
} else {
break
}
}
return title, tags
}
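// extractFirstLine returns the first plain prose line of a file (skipping
// headings, property lines, and frontmatter fences), truncated to 120 runes.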
func extractFirstLine(path string) string {
f, err := os.Open(path)
if err != nil {
return ""
}
defer f.Close()
scanner := bufio.NewScanner(f)
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
if line == "" || strings.HasPrefix(line, "#") || strings.HasPrefix(line, "---") || strings.Contains(line, "::") {
continue
}
// Truncate on rune boundaries; byte slicing could split accented characters.
if runes := []rune(line); len(runes) > 120 {
return string(runes[:120]) + "..."
}
return line
}
return ""
}
// Public accessors
func (idx *Index) GetPage(urlPath string) *Page {
idx.mu.RLock()
defer idx.mu.RUnlock()
return idx.byFullPath[urlPath]
}
func (idx *Index) ResolveLinkTarget(target string) string {
idx.mu.RLock()
defer idx.mu.RUnlock()
return resolveTarget(target, idx.byFullPath, idx.byName)
}
func (idx *Index) GetBacklinks(urlPath string) []string {
idx.mu.RLock()
defer idx.mu.RUnlock()
return idx.backlinks[urlPath]
}
func (idx *Index) GetDomains() []Domain {
idx.mu.RLock()
defer idx.mu.RUnlock()
return idx.domains
}
func (idx *Index) GetAllPages() []*Page {
idx.mu.RLock()
defer idx.mu.RUnlock()
return idx.allPages
}
func (idx *Index) GetAllTags() []string {
idx.mu.RLock()
defer idx.mu.RUnlock()
return idx.allTags
}
func (idx *Index) GetPagesByTag(tag string) []*Page {
idx.mu.RLock()
defer idx.mu.RUnlock()
return idx.byTag[strings.ToLower(tag)]
}
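// GetOrphanPages returns pages with no incoming links, ignoring READMEs.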
func (idx *Index) GetOrphanPages() []*Page {
idx.mu.RLock()
defer idx.mu.RUnlock()
var orphans []*Page
for _, page := range idx.allPages {
if len(idx.backlinks[page.URLPath]) == 0 && page.Name != "README" {
orphans = append(orphans, page)
}
}
return orphans
}
func (idx *Index) GetBrokenLinks() []BrokenLink {
idx.mu.RLock()
defer idx.mu.RUnlock()
return idx.brokenLinks
}
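// GetMostLinked returns the n pages with the most backlinks, in descending
// order; pages with zero backlinks are excluded.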
func (idx *Index) GetMostLinked(n int) []*Page {
idx.mu.RLock()
defer idx.mu.RUnlock()
type scored struct {
page *Page
count int
}
var items []scored
for _, page := range idx.allPages {
count := len(idx.backlinks[page.URLPath])
if count > 0 {
items = append(items, scored{page, count})
}
}
sort.Slice(items, func(i, j int) bool {
return items[i].count > items[j].count
})
if len(items) > n {
items = items[:n]
}
var result []*Page
for _, item := range items {
result = append(result, item.page)
}
return result
}
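// GetRecentPages returns the n most recently modified pages.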
func (idx *Index) GetRecentPages(n int) []*Page {
idx.mu.RLock()
defer idx.mu.RUnlock()
sorted := make([]*Page, len(idx.allPages))
copy(sorted, idx.allPages)
sort.Slice(sorted, func(i, j int) bool {
return sorted[i].ModTime.After(sorted[j].ModTime)
})
if len(sorted) > n {
sorted = sorted[:n]
}
return sorted
}
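// GetGraphData assembles the full link graph; node size reflects total link
// degree, with a minimum of 1 so orphan pages stay visible.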
func (idx *Index) GetGraphData() GraphData {
idx.mu.RLock()
defer idx.mu.RUnlock()
nodeMap := make(map[string]bool)
var nodes []GraphNode
var links []GraphLink
// Add all pages as nodes
for _, page := range idx.allPages {
size := len(idx.backlinks[page.URLPath]) + len(idx.outlinks[page.URLPath])
if size == 0 {
size = 1
}
nodes = append(nodes, GraphNode{
ID: page.URLPath,
Title: page.Title,
Domain: page.Domain,
Size: size,
})
nodeMap[page.URLPath] = true
}
// Add links
for source, targets := range idx.outlinks {
for _, target := range targets {
if nodeMap[source] && nodeMap[target] {
links = append(links, GraphLink{
Source: source,
Target: target,
})
}
}
}
return GraphData{Nodes: nodes, Links: links}
}
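// ListDir lists a directory under docsRoot for the browse view; markdown
// entries get extension-less URL paths, other files keep their names.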
func (idx *Index) ListDir(relDir string) ([]DirEntry, error) {
idx.mu.RLock()
defer idx.mu.RUnlock()
absDir := filepath.Join(idx.docsRoot, relDir)
entries, err := os.ReadDir(absDir)
if err != nil {
return nil, err
}
var result []DirEntry
for _, e := range entries {
name := e.Name()
entryRel := filepath.ToSlash(filepath.Join(relDir, name))
if idx.isExcluded(entryRel) || strings.HasPrefix(name, ".") {
continue
}
info, err := e.Info()
if err != nil {
continue
}
if e.IsDir() {
result = append(result, DirEntry{Name: name, URLPath: entryRel, IsDir: true, ModTime: info.ModTime()})
} else if strings.HasSuffix(name, ".md") {
result = append(result, DirEntry{Name: name, URLPath: strings.TrimSuffix(entryRel, ".md"), IsDir: false, ModTime: info.ModTime(), Size: info.Size()})
} else {
result = append(result, DirEntry{Name: name, URLPath: entryRel, IsDir: false, ModTime: info.ModTime(), Size: info.Size()})
}
}
return result, nil
}
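// Watch registers fsnotify watchers on every non-excluded directory under
// docsRoot and triggers a debounced (500ms) Rebuild on any create, write,
// remove, or rename, then invokes onChange with the changed URL path. The
// watcher lives for the lifetime of the process and is never closed.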
func (idx *Index) Watch(onChange func(string)) {
watcher, err := fsnotify.NewWatcher()
if err != nil {
return
}
go func() {
var debounce *time.Timer
for {
select {
case event, ok := <-watcher.Events:
if !ok {
return
}
if event.Has(fsnotify.Create) || event.Has(fsnotify.Write) || event.Has(fsnotify.Remove) || event.Has(fsnotify.Rename) {
changedPath := ""
relSlash := ""
if rel, err := filepath.Rel(idx.docsRoot, event.Name); err == nil {
relSlash = filepath.ToSlash(rel)
changedPath = strings.TrimSuffix(relSlash, ".md")
}
// fsnotify is not recursive: register non-excluded directories created after startup.
if event.Has(fsnotify.Create) && relSlash != "" && !idx.isExcluded(relSlash) {
if fi, statErr := os.Stat(event.Name); statErr == nil && fi.IsDir() {
watcher.Add(event.Name)
}
}
if debounce != nil {
debounce.Stop()
}
debounce = time.AfterFunc(500*time.Millisecond, func() {
idx.Rebuild()
if onChange != nil {
onChange(changedPath)
}
})
}
case _, ok := <-watcher.Errors:
if !ok {
return
}
}
}
}()
filepath.Walk(idx.docsRoot, func(path string, info os.FileInfo, err error) error {
if err != nil || !info.IsDir() {
return nil
}
relPath, _ := filepath.Rel(idx.docsRoot, path)
if relPath != "." && idx.isExcluded(filepath.ToSlash(relPath)) {
return filepath.SkipDir
}
watcher.Add(path)
return nil
})
}
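// makeTrigrams returns the set of rune trigrams of s, used for fuzzy matching.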
func makeTrigrams(s string) map[string]bool {
result := make(map[string]bool)
runes := []rune(s)
for i := 0; i+2 < len(runes); i++ {
result[string(runes[i:i+3])] = true
}
return result
}
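// GetContentForSearch returns the cached, lowercased body of a page, or the
// empty string if the page is not indexed.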
func (idx *Index) GetContentForSearch(urlPath string) string {
idx.mu.RLock()
defer idx.mu.RUnlock()
return idx.contentCache[urlPath]
}
func (idx *Index) GetDomainHealth() []DomainHealth {
idx.mu.RLock()
defer idx.mu.RUnlock()
return idx.cachedDomainHealth
}
func (idx *Index) GetCachedLinkSuggestions() []LinkSuggestion {
idx.mu.RLock()
defer idx.mu.RUnlock()
return idx.cachedLinkSuggestions
}
func (idx *Index) GetCachedDuplicates() []DuplicateGroup {
idx.mu.RLock()
defer idx.mu.RUnlock()
return idx.cachedDuplicates
}
func (idx *Index) IsAnalyticsReady() bool {
idx.mu.RLock()
defer idx.mu.RUnlock()
return idx.analyticsReady
}
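// computeAnalytics precomputes the expensive O(n²) analytics in the
// background after each Rebuild. GetLinkSuggestions and GetDuplicates (along
// with the LinkSuggestion and DuplicateGroup types) live elsewhere in this
// package.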
func (idx *Index) computeAnalytics() {
// Domain health (fast, O(n))
health := idx.computeDomainHealth()
// Link suggestions (slow, O(n²) — but we're in a goroutine)
suggestions := idx.GetLinkSuggestions(30)
// Duplicates (slow, O(n²))
duplicates := idx.GetDuplicates(0.6)
idx.mu.Lock()
idx.cachedDomainHealth = health
idx.cachedLinkSuggestions = suggestions
idx.cachedDuplicates = duplicates
idx.analyticsReady = true
idx.mu.Unlock()
}
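// computeDomainHealth tallies raw counts into the Pct* fields in a first
// pass, then converts them to percentages and per-page averages in a second.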
func (idx *Index) computeDomainHealth() []DomainHealth {
idx.mu.RLock()
defer idx.mu.RUnlock()
now := time.Now()
thirtyDaysAgo := now.AddDate(0, 0, -30)
healthMap := make(map[string]*DomainHealth)
for _, dom := range idx.domains {
healthMap[dom.FullDir] = &DomainHealth{
Domain: dom.FullDir, Color: dom.Color, TotalPages: dom.Count,
}
}
for _, page := range idx.allPages {
h, ok := healthMap[page.Domain]
if !ok {
continue
}
if len(idx.backlinks[page.URLPath]) > 0 {
h.PctLinked++
}
if len(page.Tags) > 0 {
h.PctTagged++
}
if page.ModTime.After(thirtyDaysAgo) {
h.PctRecent++
}
h.AvgLinks += float64(len(idx.outlinks[page.URLPath]))
}
var result []DomainHealth
for _, dom := range idx.domains {
h := healthMap[dom.FullDir]
if h.TotalPages > 0 {
h.PctLinked = h.PctLinked * 100 / h.TotalPages
h.PctTagged = h.PctTagged * 100 / h.TotalPages
h.PctRecent = h.PctRecent * 100 / h.TotalPages
h.AvgLinks = h.AvgLinks / float64(h.TotalPages)
}
result = append(result, *h)
}
return result
}