package wiki

import (
	"sort"
	"strings"
	"time"
	"unicode/utf8"
)

// LinkSuggestion represents a page mentioned but not linked
type LinkSuggestion struct {
	Source      string // page that mentions the target
	Target      string // page URL path that could be linked
	TargetTitle string // display title of the target page
	Mentions    int    // how many times the name appears
}

// SimilarPage represents a page similar to another
type SimilarPage struct {
	Page       *Page
	Similarity float64
}

// DuplicateGroup groups pages with very similar content
type DuplicateGroup struct {
	Pages      []*Page
	Similarity float64
}

// DomainDep shows how many links connect two domains
type DomainDep struct {
	Source string `json:"source"`
	Target string `json:"target"`
	Weight int    `json:"weight"`
}

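// Serialized, a DomainDep is a flat edge record suitable for graph
// visualizations. A sketch of the JSON shape (domain names here are
// hypothetical examples, not taken from the wiki):
//
//	{"source":"03-networking", "target":"07-hardware", "weight":12}
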
// TimelineEntry represents activity on a single day
type TimelineEntry struct {
	Date  string   `json:"date"` // "2006-01-02"
	Count int      `json:"count"`
	Pages []string `json:"pages,omitempty"`
}

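// Because of omitempty, a zero-activity day serializes without the pages
// key. A sketch of both shapes (titles hypothetical):
//
//	{"date":"2006-01-02", "count":2, "pages":["Page A", "Page B"]}
//	{"date":"2006-01-03", "count":0}
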
// GetLinkSuggestions finds pages that mention other page names without linking to them
func (idx *Index) GetLinkSuggestions(maxResults int) []LinkSuggestion {
	idx.mu.RLock()
	defer idx.mu.RUnlock()

	var suggestions []LinkSuggestion

	for _, page := range idx.allPages {
		contentLower := idx.contentCache[page.URLPath]
		if contentLower == "" {
			continue
		}

		// Check which pages are mentioned by name but not linked
		outlinked := make(map[string]bool)
		for _, target := range idx.outlinks[page.URLPath] {
			outlinked[target] = true
		}

		for _, candidate := range idx.allPages {
			if candidate.URLPath == page.URLPath {
				continue
			}
			if outlinked[candidate.URLPath] {
				continue
			}

			nameLower := strings.ToLower(candidate.Name)
			if len(nameLower) < 4 {
				continue // skip very short names (README etc)
			}

			// Count mentions of the candidate name. This is a plain
			// substring count, so names embedded in longer words also
			// match; the length filter above keeps the worst noise out.
			count := strings.Count(contentLower, nameLower)
			if count > 0 {
				suggestions = append(suggestions, LinkSuggestion{
					Source:      page.URLPath,
					Target:      candidate.URLPath,
					TargetTitle: candidate.Title,
					Mentions:    count,
				})
			}
		}
	}

	sort.Slice(suggestions, func(i, j int) bool {
		return suggestions[i].Mentions > suggestions[j].Mentions
	})

	if len(suggestions) > maxResults {
		suggestions = suggestions[:maxResults]
	}
	return suggestions
}

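// makeTrigrams is defined elsewhere in the package; from its use below
// (len, range, and set-membership tests) it evidently maps a string to a
// set of character trigrams. A minimal sketch of that assumed shape:
//
//	func makeTrigrams(s string) map[string]bool {
//		set := make(map[string]bool)
//		for i := 0; i+3 <= len(s); i++ {
//			set[s[i:i+3]] = true
//		}
//		return set
//	}
//
// With sets like these, the similarity computed below is the number of
// shared trigrams divided by the size of the larger set: an overlap score
// in [0, 1], slightly more generous than a true Jaccard index (which would
// divide by the size of the union).
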
// GetSimilarPages finds pages with content similar to the given page
func (idx *Index) GetSimilarPages(urlPath string, maxResults int) []SimilarPage {
	idx.mu.RLock()
	defer idx.mu.RUnlock()

	source := idx.contentCache[urlPath]
	if source == "" {
		return nil
	}

	// Compare only the first 2 KB of each page: enough signal for
	// similarity without scanning whole documents.
	sourceTrigrams := makeTrigrams(source[:min(len(source), 2000)])

	var results []SimilarPage
	for _, page := range idx.allPages {
		if page.URLPath == urlPath {
			continue
		}
		content := idx.contentCache[page.URLPath]
		if content == "" {
			continue
		}

		targetTrigrams := makeTrigrams(content[:min(len(content), 2000)])

		common := 0
		for t := range sourceTrigrams {
			if targetTrigrams[t] {
				common++
			}
		}
		total := len(sourceTrigrams)
		if len(targetTrigrams) > total {
			total = len(targetTrigrams)
		}
		if total == 0 {
			continue
		}
		sim := float64(common) / float64(total)
		if sim > 0.25 {
			results = append(results, SimilarPage{Page: page, Similarity: sim})
		}
	}

	sort.Slice(results, func(i, j int) bool {
		return results[i].Similarity > results[j].Similarity
	})

	if len(results) > maxResults {
		results = results[:maxResults]
	}
	return results
}

// GetDuplicates finds groups of pages with very similar content
func (idx *Index) GetDuplicates(threshold float64) []DuplicateGroup {
	idx.mu.RLock()
	defer idx.mu.RUnlock()

	type pair struct {
		a, b *Page
		sim  float64
	}
	var pairs []pair

	// Pairwise comparison is O(n²) in page count, kept cheap by skipping
	// near-empty pages and comparing only the first 1.5 KB of each.
	pages := idx.allPages
	for i := 0; i < len(pages); i++ {
		contentA := idx.contentCache[pages[i].URLPath]
		if len(contentA) < 100 {
			continue
		}
		triA := makeTrigrams(contentA[:min(len(contentA), 1500)])

		for j := i + 1; j < len(pages); j++ {
			contentB := idx.contentCache[pages[j].URLPath]
			if len(contentB) < 100 {
				continue
			}
			triB := makeTrigrams(contentB[:min(len(contentB), 1500)])

			common := 0
			for t := range triA {
				if triB[t] {
					common++
				}
			}
			total := len(triA)
			if len(triB) > total {
				total = len(triB)
			}
			if total > 0 {
				sim := float64(common) / float64(total)
				if sim >= threshold {
					pairs = append(pairs, pair{pages[i], pages[j], sim})
				}
			}
		}
	}

	// Simple grouping: each pair becomes a group
	var groups []DuplicateGroup
	for _, p := range pairs {
		groups = append(groups, DuplicateGroup{
			Pages:      []*Page{p.a, p.b},
			Similarity: p.sim,
		})
	}

	sort.Slice(groups, func(i, j int) bool {
		return groups[i].Similarity > groups[j].Similarity
	})

	if len(groups) > 20 {
		groups = groups[:20]
	}
	return groups
}

// GetDomainDeps returns inter-domain link counts
func (idx *Index) GetDomainDeps() []DomainDep {
	idx.mu.RLock()
	defer idx.mu.RUnlock()

	counts := make(map[string]int) // "source|target" → count

	for source, targets := range idx.outlinks {
		srcPage := idx.byFullPath[source]
		if srcPage == nil || srcPage.Domain == "" {
			continue
		}
		for _, target := range targets {
			tgtPage := idx.byFullPath[target]
			if tgtPage == nil || tgtPage.Domain == "" || tgtPage.Domain == srcPage.Domain {
				continue
			}
			key := srcPage.Domain + "|" + tgtPage.Domain
			counts[key]++
		}
	}

	var deps []DomainDep
	for key, count := range counts {
		parts := strings.SplitN(key, "|", 2)
		deps = append(deps, DomainDep{Source: parts[0], Target: parts[1], Weight: count})
	}

	sort.Slice(deps, func(i, j int) bool {
		return deps[i].Weight > deps[j].Weight
	})
	return deps
}

// GetTimeline returns modification activity per day for the last N days
func (idx *Index) GetTimeline(days int) []TimelineEntry {
	idx.mu.RLock()
	defer idx.mu.RUnlock()

	now := time.Now()

	// Bucket every page by its modification date; the loop below only reads
	// the last N dates, so this also covers pages modified early on the
	// oldest day (an hour-based cutoff here would truncate that day).
	dayMap := make(map[string][]string)
	for _, page := range idx.allPages {
		dateStr := page.ModTime.Format("2006-01-02")
		dayMap[dateStr] = append(dayMap[dateStr], page.Title)
	}

	var entries []TimelineEntry
	for d := 0; d < days; d++ {
		date := now.AddDate(0, 0, -d)
		dateStr := date.Format("2006-01-02")
		pages := dayMap[dateStr]
		entries = append(entries, TimelineEntry{
			Date:  dateStr,
			Count: len(pages),
			Pages: pages,
		})
	}
	return entries
}

// GetPageSummary returns the first meaningful non-heading line as a summary
func (idx *Index) GetPageSummary(urlPath string) string {
	content := idx.GetContentForSearch(urlPath)
	if content == "" {
		return ""
	}
	lines := strings.Split(content, "\n")
	for _, line := range lines {
		line = strings.TrimSpace(line)
		// Skip blanks, headings, frontmatter/horizontal rules, Logseq
		// properties, blockquotes, and table rows.
		if line == "" || strings.HasPrefix(line, "#") || strings.HasPrefix(line, "---") ||
			strings.Contains(line, "::") || strings.HasPrefix(line, ">") || strings.HasPrefix(line, "|") {
			continue
		}
		if len(line) > 160 {
			// Truncate at a rune boundary so multi-byte UTF-8 characters
			// are not split mid-sequence.
			cut := 160
			for cut > 0 && !utf8.RuneStart(line[cut]) {
				cut--
			}
			return line[:cut] + "..."
		}
		return line
	}
	return ""
}
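
// Usage sketch: assuming an *Index named idx built elsewhere in the package,
// the analytics above compose directly (the URL path and limits here are
// hypothetical examples, not values from the wiki):
//
//	suggestions := idx.GetLinkSuggestions(50)
//	similar := idx.GetSimilarPages("some-domain/some-page", 10)
//	dupes := idx.GetDuplicates(0.8)
//	deps := idx.GetDomainDeps()
//	timeline := idx.GetTimeline(30)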