package wiki

import (
	"sort"
	"strings"
	"time"
	"unicode/utf8"
)

// LinkSuggestion represents a page mentioned but not linked
type LinkSuggestion struct {
	Source      string // page that mentions the target
	Target      string // page URL path that could be linked
	TargetTitle string // display title of the target page
	Mentions    int    // how many times the name appears
}

// SimilarPage represents a page similar to another
type SimilarPage struct {
	Page       *Page
	Similarity float64
}

// DuplicateGroup groups pages with very similar content
type DuplicateGroup struct {
	Pages      []*Page
	Similarity float64
}

// DomainDep shows how many links connect two domains
type DomainDep struct {
	Source string `json:"source"`
	Target string `json:"target"`
	Weight int    `json:"weight"`
}

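// Serialized, a DomainDep is a flat edge record suitable for graph
// visualizations. A sketch of the JSON shape (domain names here are
// hypothetical examples, not taken from the wiki):
//
//	{"source":"03-networking", "target":"07-hardware", "weight":12}
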
// TimelineEntry represents activity on a single day
type TimelineEntry struct {
	Date  string   `json:"date"` // "2006-01-02"
	Count int      `json:"count"`
	Pages []string `json:"pages,omitempty"`
}

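// Because of omitempty, a zero-activity day serializes without the pages
// key. A sketch of both shapes (titles hypothetical):
//
//	{"date":"2006-01-02", "count":2, "pages":["Page A", "Page B"]}
//	{"date":"2006-01-03", "count":0}
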
// GetLinkSuggestions finds pages that mention other page names without linking to them
func (idx *Index) GetLinkSuggestions(maxResults int) []LinkSuggestion {
	idx.mu.RLock()
	defer idx.mu.RUnlock()

	var suggestions []LinkSuggestion

	for _, page := range idx.allPages {
		contentLower := idx.contentCache[page.URLPath]
		if contentLower == "" {
			continue
		}

		// Check which pages are mentioned by name but not linked
		outlinked := make(map[string]bool)
		for _, target := range idx.outlinks[page.URLPath] {
			outlinked[target] = true
		}

		for _, candidate := range idx.allPages {
			if candidate.URLPath == page.URLPath {
				continue
			}
			if outlinked[candidate.URLPath] {
				continue
			}

			nameLower := strings.ToLower(candidate.Name)
			if len(nameLower) < 4 {
				continue // skip very short names (README etc)
			}

			// Count mentions of the candidate name. This is a plain
			// substring count, so names embedded in longer words also
			// match; the length filter above keeps the worst noise out.
			count := strings.Count(contentLower, nameLower)
			if count > 0 {
				suggestions = append(suggestions, LinkSuggestion{
					Source:      page.URLPath,
					Target:      candidate.URLPath,
					TargetTitle: candidate.Title,
					Mentions:    count,
				})
			}
		}
	}

	sort.Slice(suggestions, func(i, j int) bool {
		return suggestions[i].Mentions > suggestions[j].Mentions
	})

	if len(suggestions) > maxResults {
		suggestions = suggestions[:maxResults]
	}
	return suggestions
}

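// makeTrigrams is defined elsewhere in the package; from its use below
// (len, range, and set-membership tests) it evidently maps a string to a
// set of character trigrams. A minimal sketch of that assumed shape:
//
//	func makeTrigrams(s string) map[string]bool {
//		set := make(map[string]bool)
//		for i := 0; i+3 <= len(s); i++ {
//			set[s[i:i+3]] = true
//		}
//		return set
//	}
//
// With sets like these, the similarity computed below is the number of
// shared trigrams divided by the size of the larger set: an overlap score
// in [0, 1], slightly more generous than a true Jaccard index (which would
// divide by the size of the union).
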
// GetSimilarPages finds pages with content similar to the given page
func (idx *Index) GetSimilarPages(urlPath string, maxResults int) []SimilarPage {
	idx.mu.RLock()
	defer idx.mu.RUnlock()

	source := idx.contentCache[urlPath]
	if source == "" {
		return nil
	}

	// Compare only the first 2 KB of each page: enough signal for
	// similarity without scanning whole documents.
	sourceTrigrams := makeTrigrams(source[:min(len(source), 2000)])

	var results []SimilarPage
	for _, page := range idx.allPages {
		if page.URLPath == urlPath {
			continue
		}
		content := idx.contentCache[page.URLPath]
		if content == "" {
			continue
		}

		targetTrigrams := makeTrigrams(content[:min(len(content), 2000)])

		common := 0
		for t := range sourceTrigrams {
			if targetTrigrams[t] {
				common++
			}
		}
		total := len(sourceTrigrams)
		if len(targetTrigrams) > total {
			total = len(targetTrigrams)
		}
		if total == 0 {
			continue
		}
		sim := float64(common) / float64(total)
		if sim > 0.25 {
			results = append(results, SimilarPage{Page: page, Similarity: sim})
		}
	}

	sort.Slice(results, func(i, j int) bool {
		return results[i].Similarity > results[j].Similarity
	})

	if len(results) > maxResults {
		results = results[:maxResults]
	}
	return results
}

// GetDuplicates finds groups of pages with very similar content
func (idx *Index) GetDuplicates(threshold float64) []DuplicateGroup {
	idx.mu.RLock()
	defer idx.mu.RUnlock()

	type pair struct {
		a, b *Page
		sim  float64
	}
	var pairs []pair

	// Pairwise comparison is O(n²) in page count, kept cheap by skipping
	// near-empty pages and comparing only the first 1.5 KB of each.
	pages := idx.allPages
	for i := 0; i < len(pages); i++ {
		contentA := idx.contentCache[pages[i].URLPath]
		if len(contentA) < 100 {
			continue
		}
		triA := makeTrigrams(contentA[:min(len(contentA), 1500)])

		for j := i + 1; j < len(pages); j++ {
			contentB := idx.contentCache[pages[j].URLPath]
			if len(contentB) < 100 {
				continue
			}
			triB := makeTrigrams(contentB[:min(len(contentB), 1500)])

			common := 0
			for t := range triA {
				if triB[t] {
					common++
				}
			}
			total := len(triA)
			if len(triB) > total {
				total = len(triB)
			}
			if total > 0 {
				sim := float64(common) / float64(total)
				if sim >= threshold {
					pairs = append(pairs, pair{pages[i], pages[j], sim})
				}
			}
		}
	}

	// Simple grouping: each pair becomes a group
	var groups []DuplicateGroup
	for _, p := range pairs {
		groups = append(groups, DuplicateGroup{
			Pages:      []*Page{p.a, p.b},
			Similarity: p.sim,
		})
	}

	sort.Slice(groups, func(i, j int) bool {
		return groups[i].Similarity > groups[j].Similarity
	})

	if len(groups) > 20 {
		groups = groups[:20]
	}
	return groups
}

// GetDomainDeps returns inter-domain link counts
func (idx *Index) GetDomainDeps() []DomainDep {
	idx.mu.RLock()
	defer idx.mu.RUnlock()

	counts := make(map[string]int) // "source|target" → count

	for source, targets := range idx.outlinks {
		srcPage := idx.byFullPath[source]
		if srcPage == nil || srcPage.Domain == "" {
			continue
		}
		for _, target := range targets {
			tgtPage := idx.byFullPath[target]
			if tgtPage == nil || tgtPage.Domain == "" || tgtPage.Domain == srcPage.Domain {
				continue
			}
			key := srcPage.Domain + "|" + tgtPage.Domain
			counts[key]++
		}
	}

	var deps []DomainDep
	for key, count := range counts {
		parts := strings.SplitN(key, "|", 2)
		deps = append(deps, DomainDep{Source: parts[0], Target: parts[1], Weight: count})
	}

	sort.Slice(deps, func(i, j int) bool {
		return deps[i].Weight > deps[j].Weight
	})
	return deps
}

// GetTimeline returns modification activity per day for the last N days
func (idx *Index) GetTimeline(days int) []TimelineEntry {
	idx.mu.RLock()
	defer idx.mu.RUnlock()

	now := time.Now()

	// Bucket every page by its modification date; the loop below only reads
	// the last N dates, so this also covers pages modified early on the
	// oldest day (an hour-based cutoff here would truncate that day).
	dayMap := make(map[string][]string)
	for _, page := range idx.allPages {
		dateStr := page.ModTime.Format("2006-01-02")
		dayMap[dateStr] = append(dayMap[dateStr], page.Title)
	}

	var entries []TimelineEntry
	for d := 0; d < days; d++ {
		date := now.AddDate(0, 0, -d)
		dateStr := date.Format("2006-01-02")
		pages := dayMap[dateStr]
		entries = append(entries, TimelineEntry{
			Date:  dateStr,
			Count: len(pages),
			Pages: pages,
		})
	}
	return entries
}

// GetPageSummary returns the first meaningful non-heading line as a summary
func (idx *Index) GetPageSummary(urlPath string) string {
	content := idx.GetContentForSearch(urlPath)
	if content == "" {
		return ""
	}
	lines := strings.Split(content, "\n")
	for _, line := range lines {
		line = strings.TrimSpace(line)
		// Skip blanks, headings, frontmatter/horizontal rules, Logseq
		// properties, blockquotes, and table rows.
		if line == "" || strings.HasPrefix(line, "#") || strings.HasPrefix(line, "---") ||
			strings.Contains(line, "::") || strings.HasPrefix(line, ">") || strings.HasPrefix(line, "|") {
			continue
		}
		if len(line) > 160 {
			// Truncate at a rune boundary so multi-byte UTF-8 characters
			// are not split mid-sequence.
			cut := 160
			for cut > 0 && !utf8.RuneStart(line[cut]) {
				cut--
			}
			return line[:cut] + "..."
		}
		return line
	}
	return ""
}
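
// Usage sketch: assuming an *Index named idx built elsewhere in the package,
// the analytics above compose directly (the URL path and limits here are
// hypothetical examples, not values from the wiki):
//
//	suggestions := idx.GetLinkSuggestions(50)
//	similar := idx.GetSimilarPages("some-domain/some-page", 10)
//	dupes := idx.GetDuplicates(0.8)
//	deps := idx.GetDomainDeps()
//	timeline := idx.GetTimeline(30)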