// talas-group/talas-wiki/internal/wiki/indexer.go

package wiki
import (
"bufio"
"os"
"path/filepath"
"regexp"
"sort"
"strings"
"sync"
"time"

"github.com/fsnotify/fsnotify"
)
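// Page is one indexed markdown file; URLPath is RelPath without the .md extension.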
type Page struct {
RelPath string // e.g. "01_PILOTAGE/CALENDRIER_GENERAL.md"
URLPath string // e.g. "01_PILOTAGE/CALENDRIER_GENERAL"
AbsPath string
Title string
Domain string // e.g. "01_PILOTAGE"
Name string // e.g. "CALENDRIER_GENERAL"
Tags []string
ModTime time.Time
Size int64
}
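// DirEntry is one row in a directory listing produced by ListDir.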
type DirEntry struct {
Name string
URLPath string
IsDir bool
ModTime time.Time
Size int64
}
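// Domain aggregates the pages under one top-level numbered directory (e.g. 01_PILOTAGE).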
type Domain struct {
Number string
Name string
FullDir string
Desc string
Count int
Color string
}
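// DomainColors maps each two-digit domain prefix to a Nord-palette accent
// color, cycling through seven hues.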
var DomainColors = map[string]string{
"00": "#bf616a", "01": "#d08770", "02": "#ebcb8b", "03": "#a3be8c",
"04": "#88c0d0", "05": "#5e81ac", "06": "#b48ead", "07": "#bf616a",
"08": "#d08770", "09": "#ebcb8b", "10": "#a3be8c", "11": "#88c0d0",
"12": "#5e81ac",
}
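// BrokenLink records a wikilink whose target could not be resolved to a page.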
type BrokenLink struct {
Source string
Target string
Suggestion string // best fuzzy match URL path
}
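// GraphNode, GraphLink and GraphData form the JSON payload served to the
// wiki's link-graph view.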
type GraphNode struct {
ID string `json:"id"`
Title string `json:"title"`
Domain string `json:"domain"`
Size int `json:"size"` // number of links (incoming + outgoing)
}
type GraphLink struct {
Source string `json:"source"`
Target string `json:"target"`
}
type GraphData struct {
Nodes []GraphNode `json:"nodes"`
Links []GraphLink `json:"links"`
}
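// DomainHealth summarizes per-domain quality metrics; the Pct* fields are
// percentages (0-100) of pages that are linked, tagged, or modified within
// the last 30 days.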
type DomainHealth struct {
Domain string
Color string
PctLinked int
PctTagged int
PctRecent int
AvgLinks float64
TotalPages int
}
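// Index is the in-memory index of the wiki. Rebuild replaces all maps
// wholesale under the write lock, so readers always see a consistent snapshot.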
type Index struct {
mu sync.RWMutex
docsRoot string
excludes map[string]bool
byFullPath map[string]*Page
byName map[string][]*Page
byTag map[string][]*Page
backlinks map[string][]string
outlinks map[string][]string
contentCache map[string]string
brokenLinks []BrokenLink
domains []Domain
allPages []*Page
allTags []string
// Precomputed analytics (computed async after Rebuild)
cachedLinkSuggestions []LinkSuggestion
cachedDuplicates []DuplicateGroup
cachedDomainHealth []DomainHealth
analyticsReady bool
}
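// wikilinkRe captures the body of [[...]] wikilinks; an optional |label is
// stripped by the caller.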
var wikilinkRe = regexp.MustCompile(`\[\[([^\]]+)\]\]`)
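// NewIndex builds the index for docsRoot, skipping excludeDirs, and performs
// an initial synchronous Rebuild.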
func NewIndex(docsRoot string, excludeDirs []string) *Index {
excludes := make(map[string]bool, len(excludeDirs))
for _, d := range excludeDirs {
excludes[d] = true
}
idx := &Index{
docsRoot: docsRoot,
excludes: excludes,
}
idx.Rebuild()
return idx
}
func (idx *Index) GetDocsRoot() string {
return idx.docsRoot
}
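// isExcluded reports whether relPath (slash-separated, relative to docsRoot)
// is inside an excluded directory or any dot-directory.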
func (idx *Index) isExcluded(relPath string) bool {
for exc := range idx.excludes {
if relPath == exc || strings.HasPrefix(relPath, exc+"/") {
return true
}
}
parts := strings.Split(relPath, "/")
for _, p := range parts {
if strings.HasPrefix(p, ".") {
return true
}
}
return false
}
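// Rebuild rescans docsRoot from scratch: it walks all markdown files, parses
// titles and tags, resolves wikilinks into backlink/outlink maps, collects
// broken links with fuzzy suggestions, then swaps the new state in under the
// write lock. Heavy analytics are recomputed asynchronously afterwards.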
func (idx *Index) Rebuild() {
byFullPath := make(map[string]*Page)
byName := make(map[string][]*Page)
byTag := make(map[string][]*Page)
backlinks := make(map[string][]string)
outlinks := make(map[string][]string)
var brokenLinks []BrokenLink
var allPages []*Page
domainMap := make(map[string]*Domain)
tagSet := make(map[string]bool)
filepath.Walk(idx.docsRoot, func(path string, info os.FileInfo, err error) error {
if err != nil {
return nil
}
relPath, _ := filepath.Rel(idx.docsRoot, path)
relPath = filepath.ToSlash(relPath) // normalize so exclusion checks and URL paths use "/" on every OS
if relPath == "." {
return nil
}
if idx.isExcluded(relPath) {
if info.IsDir() {
return filepath.SkipDir
}
return nil
}
if info.IsDir() || !strings.HasSuffix(relPath, ".md") {
return nil
}
urlPath := strings.TrimSuffix(relPath, ".md")
name := strings.TrimSuffix(filepath.Base(relPath), ".md")
domain := ""
parts := strings.SplitN(relPath, "/", 2)
if len(parts) > 1 {
domain = parts[0]
}
title, tags := extractTitleAndTags(path, name)
page := &Page{
RelPath: relPath,
URLPath: urlPath,
AbsPath: path,
Title: title,
Domain: domain,
Name: name,
Tags: tags,
ModTime: info.ModTime(),
Size: info.Size(),
}
byFullPath[urlPath] = page
byName[name] = append(byName[name], page)
allPages = append(allPages, page)
for _, tag := range tags {
tagLower := strings.ToLower(tag)
byTag[tagLower] = append(byTag[tagLower], page)
tagSet[tagLower] = true
}
if domain != "" {
if _, ok := domainMap[domain]; !ok {
num := ""
dname := domain
if i := strings.Index(domain, "_"); i >= 0 {
num = domain[:i]
dname = domain[i+1:]
}
domainMap[domain] = &Domain{
Number: num,
Name: dname,
FullDir: domain,
Color: DomainColors[num],
}
}
domainMap[domain].Count++
}
return nil
})
// Build link maps + content cache
contentCache := make(map[string]string, len(allPages))
for _, page := range allPages {
content, err := os.ReadFile(page.AbsPath)
if err != nil {
continue
}
text := string(content)
contentCache[page.URLPath] = strings.ToLower(text)
matches := wikilinkRe.FindAllStringSubmatch(text, -1)
seen := make(map[string]bool)
for _, m := range matches {
target := m[1]
if pipeIdx := strings.Index(target, "|"); pipeIdx >= 0 {
target = target[:pipeIdx]
}
target = strings.TrimSpace(target)
if target == "" || strings.HasPrefix(target, "!") {
continue
}
resolved := resolveTarget(target, byFullPath, byName)
if resolved != "" {
if !seen[resolved] {
backlinks[resolved] = append(backlinks[resolved], page.URLPath)
outlinks[page.URLPath] = append(outlinks[page.URLPath], resolved)
seen[resolved] = true
}
} else if !isAssetTarget(target) {
brokenLinks = append(brokenLinks, BrokenLink{
Source: page.URLPath,
Target: target,
})
}
}
}
// Domain descriptions
for key, dom := range domainMap {
readmePath := filepath.Join(idx.docsRoot, dom.FullDir, "README.md")
dom.Desc = extractFirstLine(readmePath)
domainMap[key] = dom
}
var domains []Domain
for _, dom := range domainMap {
domains = append(domains, *dom)
}
sort.Slice(domains, func(i, j int) bool {
return domains[i].Number < domains[j].Number
})
var allTags []string
for tag := range tagSet {
allTags = append(allTags, tag)
}
sort.Strings(allTags)
// Compute broken link suggestions using fuzzy matching
for i := range brokenLinks {
bestScore := 0
bestMatch := ""
targetLower := strings.ToLower(brokenLinks[i].Target)
for _, page := range allPages {
nameLower := strings.ToLower(page.Name)
score := 0
if strings.Contains(nameLower, targetLower) || strings.Contains(targetLower, nameLower) {
score = 10
} else {
triA := makeTrigrams(targetLower)
triB := makeTrigrams(nameLower)
common := 0
for t := range triA {
if triB[t] {
common++
}
}
total := len(triA)
if len(triB) > total {
total = len(triB)
}
if total > 0 && float64(common)/float64(total) > 0.3 {
score = common
}
}
if score > bestScore {
bestScore = score
bestMatch = page.URLPath
}
}
brokenLinks[i].Suggestion = bestMatch
}
idx.mu.Lock()
idx.byFullPath = byFullPath
idx.byName = byName
idx.byTag = byTag
idx.backlinks = backlinks
idx.outlinks = outlinks
idx.contentCache = contentCache
idx.brokenLinks = brokenLinks
idx.domains = domains
idx.allPages = allPages
idx.allTags = allTags
idx.analyticsReady = false
idx.mu.Unlock()
// Compute heavy analytics asynchronously
go idx.computeAnalytics()
}
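// isAssetTarget reports whether a wikilink target points at a binary or
// static asset rather than a markdown page, so it is not flagged as broken.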
func isAssetTarget(target string) bool {
exts := []string{".pdf", ".png", ".jpg", ".jpeg", ".gif", ".svg", ".ods", ".xlsx", ".zip", ".mp3", ".wav", ".html"}
lower := strings.ToLower(target)
for _, ext := range exts {
if strings.HasSuffix(lower, ext) {
return true
}
}
return false
}
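// resolveTarget maps a wikilink target to a page URL path: exact full-path
// match first, then bare-name lookup (the first page wins on name collisions).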
func resolveTarget(target string, byFullPath map[string]*Page, byName map[string][]*Page) string {
target = strings.TrimSuffix(target, ".md")
if p, ok := byFullPath[target]; ok {
return p.URLPath
}
name := filepath.Base(target)
if pages, ok := byName[name]; ok && len(pages) > 0 {
return pages[0].URLPath
}
return ""
}
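// extractTitleAndTags scans the top of a file: Logseq property lines are
// skipped, "tags::" values are collected, and the first "# " heading becomes
// the title. Scanning stops at the first regular content line; the fallback
// (the file name) is used when no heading is found.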
func extractTitleAndTags(path string, fallback string) (string, []string) {
f, err := os.Open(path)
if err != nil {
return fallback, nil
}
defer f.Close()
title := fallback
var tags []string
scanner := bufio.NewScanner(f)
foundTitle := false
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
// Extract tags from Logseq properties
if strings.HasPrefix(line, "tags::") {
tagStr := strings.TrimPrefix(line, "tags::")
tagStr = strings.TrimSpace(tagStr)
for _, t := range strings.Split(tagStr, ",") {
t = strings.TrimSpace(t)
if t != "" {
tags = append(tags, t)
}
}
continue
}
// Skip other Logseq properties
if strings.Contains(line, ":: ") && !strings.HasPrefix(line, "#") {
continue
}
if !foundTitle {
if strings.HasPrefix(line, "# ") {
title = strings.TrimPrefix(line, "# ")
foundTitle = true
continue
}
if line != "" && !strings.HasPrefix(line, "---") {
break
}
} else {
break
}
}
return title, tags
}
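// extractFirstLine returns the first plain prose line of a file (skipping
// headings, property lines, and frontmatter fences), truncated to 120 runes.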
func extractFirstLine(path string) string {
f, err := os.Open(path)
if err != nil {
return ""
}
defer f.Close()
scanner := bufio.NewScanner(f)
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
if line == "" || strings.HasPrefix(line, "#") || strings.HasPrefix(line, "---") || strings.Contains(line, "::") {
continue
}
// Truncate on rune boundaries; byte slicing could split accented characters.
if runes := []rune(line); len(runes) > 120 {
return string(runes[:120]) + "..."
}
return line
}
return ""
}
// Public accessors
func (idx *Index) GetPage(urlPath string) *Page {
idx.mu.RLock()
defer idx.mu.RUnlock()
return idx.byFullPath[urlPath]
}
func (idx *Index) ResolveLinkTarget(target string) string {
idx.mu.RLock()
defer idx.mu.RUnlock()
return resolveTarget(target, idx.byFullPath, idx.byName)
}
func (idx *Index) GetBacklinks(urlPath string) []string {
idx.mu.RLock()
defer idx.mu.RUnlock()
return idx.backlinks[urlPath]
}
func (idx *Index) GetDomains() []Domain {
idx.mu.RLock()
defer idx.mu.RUnlock()
return idx.domains
}
func (idx *Index) GetAllPages() []*Page {
idx.mu.RLock()
defer idx.mu.RUnlock()
return idx.allPages
}
func (idx *Index) GetAllTags() []string {
idx.mu.RLock()
defer idx.mu.RUnlock()
return idx.allTags
}
func (idx *Index) GetPagesByTag(tag string) []*Page {
idx.mu.RLock()
defer idx.mu.RUnlock()
return idx.byTag[strings.ToLower(tag)]
}
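// GetOrphanPages returns pages with no incoming links, ignoring READMEs.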
func (idx *Index) GetOrphanPages() []*Page {
idx.mu.RLock()
defer idx.mu.RUnlock()
var orphans []*Page
for _, page := range idx.allPages {
if len(idx.backlinks[page.URLPath]) == 0 && page.Name != "README" {
orphans = append(orphans, page)
}
}
return orphans
}
func (idx *Index) GetBrokenLinks() []BrokenLink {
idx.mu.RLock()
defer idx.mu.RUnlock()
return idx.brokenLinks
}
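// GetMostLinked returns the n pages with the most backlinks, in descending
// order; pages with zero backlinks are excluded.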
func (idx *Index) GetMostLinked(n int) []*Page {
idx.mu.RLock()
defer idx.mu.RUnlock()
type scored struct {
page *Page
count int
}
var items []scored
for _, page := range idx.allPages {
count := len(idx.backlinks[page.URLPath])
if count > 0 {
items = append(items, scored{page, count})
}
}
sort.Slice(items, func(i, j int) bool {
return items[i].count > items[j].count
})
if len(items) > n {
items = items[:n]
}
var result []*Page
for _, item := range items {
result = append(result, item.page)
}
return result
}
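// GetRecentPages returns the n most recently modified pages.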
func (idx *Index) GetRecentPages(n int) []*Page {
idx.mu.RLock()
defer idx.mu.RUnlock()
sorted := make([]*Page, len(idx.allPages))
copy(sorted, idx.allPages)
sort.Slice(sorted, func(i, j int) bool {
return sorted[i].ModTime.After(sorted[j].ModTime)
})
if len(sorted) > n {
sorted = sorted[:n]
}
return sorted
}
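// GetGraphData assembles the full link graph; node size reflects total link
// degree, with a minimum of 1 so orphan pages stay visible.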
func (idx *Index) GetGraphData() GraphData {
idx.mu.RLock()
defer idx.mu.RUnlock()
nodeMap := make(map[string]bool)
var nodes []GraphNode
var links []GraphLink
// Add all pages as nodes
for _, page := range idx.allPages {
size := len(idx.backlinks[page.URLPath]) + len(idx.outlinks[page.URLPath])
if size == 0 {
size = 1
}
nodes = append(nodes, GraphNode{
ID: page.URLPath,
Title: page.Title,
Domain: page.Domain,
Size: size,
})
nodeMap[page.URLPath] = true
}
// Add links
for source, targets := range idx.outlinks {
for _, target := range targets {
if nodeMap[source] && nodeMap[target] {
links = append(links, GraphLink{
Source: source,
Target: target,
})
}
}
}
return GraphData{Nodes: nodes, Links: links}
}
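// ListDir lists a directory under docsRoot for the browse view; markdown
// entries get extension-less URL paths, other files keep their names.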
func (idx *Index) ListDir(relDir string) ([]DirEntry, error) {
idx.mu.RLock()
defer idx.mu.RUnlock()
absDir := filepath.Join(idx.docsRoot, relDir)
entries, err := os.ReadDir(absDir)
if err != nil {
return nil, err
}
var result []DirEntry
for _, e := range entries {
name := e.Name()
entryRel := filepath.ToSlash(filepath.Join(relDir, name))
if idx.isExcluded(entryRel) || strings.HasPrefix(name, ".") {
continue
}
info, err := e.Info()
if err != nil {
continue
}
if e.IsDir() {
result = append(result, DirEntry{Name: name, URLPath: entryRel, IsDir: true, ModTime: info.ModTime()})
} else if strings.HasSuffix(name, ".md") {
result = append(result, DirEntry{Name: name, URLPath: strings.TrimSuffix(entryRel, ".md"), IsDir: false, ModTime: info.ModTime(), Size: info.Size()})
} else {
result = append(result, DirEntry{Name: name, URLPath: entryRel, IsDir: false, ModTime: info.ModTime(), Size: info.Size()})
}
}
return result, nil
}
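// Watch registers fsnotify watchers on every non-excluded directory under
// docsRoot and triggers a debounced (500ms) Rebuild on any create, write,
// remove, or rename, then invokes onChange with the changed URL path. The
// watcher lives for the lifetime of the process and is never closed.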
func (idx *Index) Watch(onChange func(string)) {
watcher, err := fsnotify.NewWatcher()
if err != nil {
return
}
go func() {
var debounce *time.Timer
for {
select {
case event, ok := <-watcher.Events:
if !ok {
return
}
if event.Has(fsnotify.Create) || event.Has(fsnotify.Write) || event.Has(fsnotify.Remove) || event.Has(fsnotify.Rename) {
changedPath := ""
relSlash := ""
if rel, err := filepath.Rel(idx.docsRoot, event.Name); err == nil {
relSlash = filepath.ToSlash(rel)
changedPath = strings.TrimSuffix(relSlash, ".md")
}
// fsnotify is not recursive: register non-excluded directories created after startup.
if event.Has(fsnotify.Create) && relSlash != "" && !idx.isExcluded(relSlash) {
if fi, statErr := os.Stat(event.Name); statErr == nil && fi.IsDir() {
watcher.Add(event.Name)
}
}
if debounce != nil {
debounce.Stop()
}
debounce = time.AfterFunc(500*time.Millisecond, func() {
idx.Rebuild()
if onChange != nil {
onChange(changedPath)
}
})
}
case _, ok := <-watcher.Errors:
if !ok {
return
}
}
}
}()
filepath.Walk(idx.docsRoot, func(path string, info os.FileInfo, err error) error {
if err != nil || !info.IsDir() {
return nil
}
relPath, _ := filepath.Rel(idx.docsRoot, path)
if relPath != "." && idx.isExcluded(filepath.ToSlash(relPath)) {
return filepath.SkipDir
}
watcher.Add(path)
return nil
})
}
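// makeTrigrams returns the set of rune trigrams of s, used for fuzzy matching.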
func makeTrigrams(s string) map[string]bool {
result := make(map[string]bool)
runes := []rune(s)
for i := 0; i+2 < len(runes); i++ {
result[string(runes[i:i+3])] = true
}
return result
}
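// GetContentForSearch returns the cached, lowercased body of a page, or the
// empty string if the page is not indexed.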
func (idx *Index) GetContentForSearch(urlPath string) string {
idx.mu.RLock()
defer idx.mu.RUnlock()
return idx.contentCache[urlPath]
}
func (idx *Index) GetDomainHealth() []DomainHealth {
idx.mu.RLock()
defer idx.mu.RUnlock()
return idx.cachedDomainHealth
}
func (idx *Index) GetCachedLinkSuggestions() []LinkSuggestion {
idx.mu.RLock()
defer idx.mu.RUnlock()
return idx.cachedLinkSuggestions
}
func (idx *Index) GetCachedDuplicates() []DuplicateGroup {
idx.mu.RLock()
defer idx.mu.RUnlock()
return idx.cachedDuplicates
}
func (idx *Index) IsAnalyticsReady() bool {
idx.mu.RLock()
defer idx.mu.RUnlock()
return idx.analyticsReady
}
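// computeAnalytics precomputes the expensive O(n²) analytics in the
// background after each Rebuild. GetLinkSuggestions and GetDuplicates (along
// with the LinkSuggestion and DuplicateGroup types) live elsewhere in this
// package.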
func (idx *Index) computeAnalytics() {
// Domain health (fast, O(n))
health := idx.computeDomainHealth()
// Link suggestions (slow, O(n²) — but we're in a goroutine)
suggestions := idx.GetLinkSuggestions(30)
// Duplicates (slow, O(n²))
duplicates := idx.GetDuplicates(0.6)
idx.mu.Lock()
idx.cachedDomainHealth = health
idx.cachedLinkSuggestions = suggestions
idx.cachedDuplicates = duplicates
idx.analyticsReady = true
idx.mu.Unlock()
}
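// computeDomainHealth tallies raw counts into the Pct* fields in a first
// pass, then converts them to percentages and per-page averages in a second.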
func (idx *Index) computeDomainHealth() []DomainHealth {
idx.mu.RLock()
defer idx.mu.RUnlock()
now := time.Now()
thirtyDaysAgo := now.AddDate(0, 0, -30)
healthMap := make(map[string]*DomainHealth)
for _, dom := range idx.domains {
healthMap[dom.FullDir] = &DomainHealth{
Domain: dom.FullDir, Color: dom.Color, TotalPages: dom.Count,
}
}
for _, page := range idx.allPages {
h, ok := healthMap[page.Domain]
if !ok {
continue
}
if len(idx.backlinks[page.URLPath]) > 0 {
h.PctLinked++
}
if len(page.Tags) > 0 {
h.PctTagged++
}
if page.ModTime.After(thirtyDaysAgo) {
h.PctRecent++
}
h.AvgLinks += float64(len(idx.outlinks[page.URLPath]))
}
var result []DomainHealth
for _, dom := range idx.domains {
h := healthMap[dom.FullDir]
if h.TotalPages > 0 {
h.PctLinked = h.PctLinked * 100 / h.TotalPages
h.PctTagged = h.PctTagged * 100 / h.TotalPages
h.PctRecent = h.PctRecent * 100 / h.TotalPages
h.AvgLinks = h.AvgLinks / float64(h.TotalPages)
}
result = append(result, *h)
}
return result
}