veza/veza-docs/scripts/crawl.mjs

#!/usr/bin/env node

import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';

// ES modules have no built-in __filename/__dirname, so derive them from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

/**
 * Broken-link crawler for the Veza documentation.
 * Builds an inventory of slugs and reports links that do not resolve to one.
 */

// Configuration: all paths are resolved relative to this script's directory.
const DOCS_DIR = path.join(__dirname, '..', 'docs');
const META_DIR = path.join(__dirname, '..', 'meta');
const OUTPUT_CSV = path.join(META_DIR, 'broken-links.csv');
const SLUG_INVENTORY = path.join(META_DIR, 'slug-inventory.json');

// Patterns used to detect Markdown links of the form [text](target).
// Group 1 is always the link text and group 2 the captured target path.
// Every pattern carries the `g` flag, so `exec` advances `lastIndex` between calls.
// The patterns are not mutually exclusive: one `/docs/...` link can match several of them.
const LINK_PATTERNS = [
  // Internal links: [text](/docs/<path>); group 2 runs up to the closing parenthesis (anchor included)
  /\[([^\]]+)\]\(\/docs\/([^)]+)\)/g,
  // Internal links with the optional #anchor captured separately in group 3
  /\[([^\]]+)\]\(\/docs\/([^#)]+)(#[^)]+)?\)/g,
  // Links whose target ends in .md or .mdx (meant for relative links); optional #anchor in group 3
  /\[([^\]]+)\]\(([^)]+\.mdx?)(#[^)]+)?\)/g,
  // Internal links containing a {placeholder}: group 2 is the path before the brace,
  // group 3 the placeholder name, group 4 whatever follows the closing brace
  /\[([^\]]+)\]\(\/docs\/([^{}]+)\{([^}]+)\}([^)]+)?\)/g,
];

// Inventory of existing slugs: slug -> { file, fullPath, exists }
const slugInventory = new Map();
// One record per broken link found while scanning the documentation files
const brokenLinks = [];

/**
 * Walks DOCS_DIR recursively and registers a slug for every .md/.mdx file.
 *
 * A slug is the file path relative to DOCS_DIR, without its extension and with
 * forward slashes. An index file is additionally registered under the slug of
 * its parent directory, except when it sits directly in DOCS_DIR.
 * Results are written to the module-level `slugInventory` map.
 */
function extractSlugs() {
  console.log('🔍 Extraction des slugs...');

  const isMarkdown = (name) => /\.mdx?$/.test(name);
  const isIndexFile = (name) => /^index\.mdx?$/.test(name);

  const walk = (currentDir) => {
    for (const entry of fs.readdirSync(currentDir, { withFileTypes: true })) {
      const fullPath = path.join(currentDir, entry.name);

      if (entry.isDirectory()) {
        walk(fullPath);
        continue;
      }
      if (!entry.isFile() || !isMarkdown(entry.name)) {
        continue;
      }

      const file = path.relative(DOCS_DIR, fullPath);
      const slug = file.replace(/\.mdx?$/, '').replace(/\\/g, '/');
      // Each key gets its own record object.
      const register = (key) => slugInventory.set(key, { file, fullPath, exists: true });

      register(slug);

      // "guide/index" is also reachable as "guide".
      if (isIndexFile(entry.name)) {
        const parentSlug = path.dirname(slug);
        if (parentSlug !== '.') {
          register(parentSlug);
        }
      }
    }
  };

  walk(DOCS_DIR);
  console.log(`✅ ${slugInventory.size} slugs extraits`);
}

/**
 * Scans one Markdown/MDX file for links and records the broken ones.
 *
 * Lines inside ``` fenced code blocks are skipped. Each link is reported at
 * most once even when several entries of LINK_PATTERNS match it. The target
 * and anchor are derived from the link URL itself (split at the first `#`),
 * so the result does not depend on which pattern matched. Relative targets
 * are resolved against the directory of the source file and looked up
 * without their .md/.mdx extension, which is how slugs are stored.
 *
 * @param {string} filePath - Path of the file to analyze (inside DOCS_DIR).
 * @returns {void} Broken links are appended to the module-level `brokenLinks`.
 */
function analyzeFile(filePath) {
  const content = fs.readFileSync(filePath, 'utf8');
  const relativePath = path.relative(DOCS_DIR, filePath);
  // Forward-slash directory of the source file, used to resolve relative links.
  const sourceDir = path.posix.dirname(relativePath.replace(/\\/g, '/'));

  const lines = content.split('\n');
  let inCodeBlock = false;

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i];

    // A fence line toggles code-block state and is never scanned itself.
    if (line.trim().startsWith('```')) {
      inCodeBlock = !inCodeBlock;
      continue;
    }
    if (inCodeBlock) {
      continue;
    }

    // Start offsets of links already handled on this line; the patterns
    // overlap, so the same link can be matched by more than one of them.
    const seenOffsets = new Set();

    for (const pattern of LINK_PATTERNS) {
      // The patterns are global regexes shared across calls; start each line from 0.
      pattern.lastIndex = 0;

      let match;
      while ((match = pattern.exec(line)) !== null) {
        if (seenOffsets.has(match.index)) {
          continue;
        }
        seenOffsets.add(match.index);

        const [fullMatch, linkText] = match;
        // Every pattern has the shape "[text](url)": drop "[text](" and the final ")".
        const url = fullMatch.slice(linkText.length + 3, -1);

        const hashAt = url.indexOf('#');
        const rawTarget = hashAt === -1 ? url : url.slice(0, hashAt);
        const anchor = hashAt === -1 ? '' : url.slice(hashAt);

        // A pure "#anchor" URL points inside the current page, not at a file.
        if (rawTarget === '') {
          continue;
        }

        // Path as reported: the /docs/ prefix is removed, nothing else.
        let cleanPath = rawTarget;
        if (cleanPath.startsWith('/docs/')) {
          cleanPath = cleanPath.substring(6);
        }

        // Path as looked up in the slug inventory.
        let lookupPath = cleanPath;
        const hasScheme = /^[a-z][a-z0-9+.-]*:/i.test(rawTarget);
        if (!hasScheme && !rawTarget.startsWith('/')) {
          lookupPath = path.posix
            .join(sourceDir, rawTarget)
            .replace(/\.mdx?$/, '');
        }

        if (!isValidLink(lookupPath, anchor)) {
          brokenLinks.push({
            sourceFile: relativePath,
            linkText: linkText.trim(),
            targetPath: cleanPath,
            anchor,
            fullMatch,
            line: i + 1,
            type: detectLinkType(fullMatch)
          });
        }
      }
    }
  }
}

/**
 * Tells whether a link target resolves to something known.
 *
 * External targets (http://, https://, mailto:) are always accepted; they are
 * not fetched. Any other target is looked up in `slugInventory`, both as-is
 * and as `<target>/index`, after removing one trailing slash. `{placeholder}`
 * segments are replaced by the literal text PLACEHOLDER before the lookup.
 *
 * @param {string} targetPath - Link target, without the `/docs/` prefix.
 * @param {string} [anchor=''] - Anchor part of the link; accepted for
 *   interface compatibility but not validated.
 * @returns {boolean} true when the link is considered valid.
 */
function isValidLink(targetPath, anchor = '') {
  // Require a real scheme: a bare "http" prefix would also match internal
  // slugs such as "http-api/overview" and hide genuinely broken links.
  if (/^(https?:\/\/|mailto:)/i.test(targetPath)) {
    return true;
  }

  const basePath = targetPath
    .replace(/\{[^}]+\}/g, 'PLACEHOLDER')
    .replace(/\/$/, '');

  return slugInventory.has(basePath) || slugInventory.has(`${basePath}/index`);
}

/**
 * Classifies a Markdown link by its target.
 *
 * Only the URL between the parentheses is inspected, so words such as "http"
 * or braces in the link text do not influence the result. When the input is
 * not of the form `[text](url)`, the whole string is treated as the target.
 *
 * @param {string} fullMatch - The matched link, e.g. `[text](/docs/page)`.
 * @returns {'placeholder'|'external'|'internal'|'relative'} The link type.
 */
function detectLinkType(fullMatch) {
  const parsed = /\]\(([^)]*)\)$/.exec(fullMatch);
  const target = parsed ? parsed[1] : fullMatch;

  if (target.includes('{')) return 'placeholder';
  if (/^https?:\/\//i.test(target)) return 'external';
  if (target.startsWith('/docs/')) return 'internal';
  return 'relative';
}

/**
 * Returns the 1-based line number on which `match` first occurs in `content`.
 *
 * @param {string} content - Text to search.
 * @param {string} match - Substring to locate.
 * @returns {number} Line number of the first occurrence; 1 when `match` is
 *   not found.
 */
function getLineNumber(content, match) {
  // A missing match (-1) is treated like offset 0, i.e. line 1.
  const offset = Math.max(content.indexOf(match), 0);

  let lineNumber = 1;
  for (let pos = 0; pos < offset; pos++) {
    if (content[pos] === '\n') {
      lineNumber += 1;
    }
  }
  return lineNumber;
}

/**
 * Walks DOCS_DIR recursively and runs `analyzeFile` on every .md/.mdx file,
 * then logs how many broken links have been collected in `brokenLinks`.
 */
function scanBrokenLinks() {
  console.log('🔍 Scan des liens cassés...');

  const markdownFile = /\.mdx?$/;

  const visit = (folder) => {
    for (const entry of fs.readdirSync(folder, { withFileTypes: true })) {
      const entryPath = path.join(folder, entry.name);

      if (entry.isDirectory()) {
        visit(entryPath);
      } else if (entry.isFile() && markdownFile.test(entry.name)) {
        analyzeFile(entryPath);
      }
    }
  };

  visit(DOCS_DIR);
  console.log(`❌ ${brokenLinks.length} liens cassés détectés`);
}

/**
 * Writes the collected broken links to OUTPUT_CSV.
 *
 * The file starts with a header row followed by one row per entry of
 * `brokenLinks`. Every text field is wrapped in double quotes with embedded
 * double quotes doubled, so a quote in a link text or path cannot break the
 * row; the line number is written unquoted.
 */
function generateCSV() {
  console.log('📊 Génération du CSV...');

  // Quote a field CSV-style: wrap in quotes, double any embedded quote.
  const quote = (value) => `"${String(value).replace(/"/g, '""')}"`;

  const csvHeader = 'Source File,Link Text,Target Path,Anchor,Type,Line,Full Match\n';
  const csvRows = brokenLinks
    .map((link) => [
      quote(link.sourceFile),
      quote(link.linkText),
      quote(link.targetPath),
      quote(link.anchor),
      quote(link.type),
      link.line,
      quote(link.fullMatch),
    ].join(','))
    .join('\n');

  fs.writeFileSync(OUTPUT_CSV, csvHeader + csvRows, 'utf8');
  console.log(`✅ CSV généré: ${OUTPUT_CSV}`);
}

/**
 * Writes the slug inventory to SLUG_INVENTORY as pretty-printed JSON.
 *
 * The document holds the generation timestamp, the number of slugs and the
 * content of `slugInventory` converted to a plain object.
 */
function generateSlugInventory() {
  console.log('📋 Génération de l\'inventaire des slugs...');

  const payload = JSON.stringify(
    {
      generated: new Date().toISOString(),
      totalSlugs: slugInventory.size,
      slugs: Object.fromEntries(slugInventory.entries()),
    },
    null,
    2,
  );

  fs.writeFileSync(SLUG_INVENTORY, payload, 'utf8');
  console.log(`✅ Inventaire généré: ${SLUG_INVENTORY}`);
}

/**
 * Prints a summary to the console: number of slugs, number of broken links,
 * and the broken-link counts grouped by link type and by source file.
 */
function generateReport() {
  // Counts the broken links per distinct value of the given property.
  const countBy = (field) => {
    const counts = {};
    for (const link of brokenLinks) {
      const key = link[field];
      counts[key] = (counts[key] || 0) + 1;
    }
    return counts;
  };

  const totalSlugs = slugInventory.size;
  const totalBroken = brokenLinks.length;
  const perType = countBy('type');
  const perFile = countBy('sourceFile');

  console.log('\n📊 RAPPORT DE SYNTHÈSE');
  console.log('====================');
  console.log(`Total slugs: ${totalSlugs}`);
  console.log(`Liens cassés: ${totalBroken}`);

  console.log('\nPar type:');
  for (const [type, count] of Object.entries(perType)) {
    console.log(`  ${type}: ${count}`);
  }

  console.log('\nPar fichier:');
  for (const [file, count] of Object.entries(perFile)) {
    console.log(`  ${file}: ${count}`);
  }
}

// Main entry point
/**
 * Runs the whole crawl: ensures META_DIR exists, builds the slug inventory,
 * scans for broken links, writes the JSON inventory and the CSV, and prints
 * the summary report.
 *
 * The process exits with code 0 on success and 1 when any step throws.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log('🚀 Démarrage du crawler Veza Docs');
  console.log('==================================');

  try {
    // Created inside the try block so that a failure here is reported through
    // the same error path as the other steps. With `recursive: true` the call
    // succeeds when the directory already exists.
    fs.mkdirSync(META_DIR, { recursive: true });

    extractSlugs();
    scanBrokenLinks();
    generateSlugInventory();
    generateCSV();
    generateReport();

    console.log('\n✅ Crawler terminé avec succès');
    process.exit(0);
  } catch (error) {
    console.error('❌ Erreur lors du crawl:', error);
    process.exit(1);
  }
}

main();