// lib/html-to-text.js — turn a fetched web page into clean, LLM-friendly text.
// Zero dependencies: pragmatic tag stripping + entity decoding.

/**
 * Named character references we decode, keyed by lowercase name.
 * Built on a null prototype so inherited names such as "constructor" are
 * never mistaken for entities.
 */
const ENTITIES = Object.freeze(
  Object.assign(Object.create(null), {
    amp: "&",
    lt: "<",
    gt: ">",
    quot: '"',
    apos: "'",
    nbsp: " ",
    mdash: "—",
    ndash: "–",
    hellip: "…",
    rsquo: "'",
    lsquo: "'",
    rdquo: '"',
    ldquo: '"',
    copy: "©",
    reg: "®",
    trade: "™",
    deg: "°",
    frac12: "½",
    frac14: "¼",
    times: "×",
    divide: "÷",
    eacute: "é",
    egrave: "è",
    agrave: "à",
    ccedil: "ç",
    uuml: "ü",
    ouml: "ö",
    auml: "ä",
  }),
);

// One alternation so every reference is decoded exactly once:
// group 1 = hex digits, group 2 = decimal digits, group 3 = entity name.
// Names may contain digits after the first letter (frac12, frac14).
const ENTITY_RE = /&(?:#x([0-9a-f]+)|#(\d+)|([a-z][a-z0-9]*));/gi;

// Elements whose whole subtree is noise for text extraction.
const NON_CONTENT_TAGS = [
  "script", "style", "noscript", "svg", "iframe", "form", "nav",
  "footer", "header", "aside", "template", "button", "select",
];

// Compiled once at module load. The lookahead stops "<nav" from matching
// longer names such as "<navigation>".
const NON_CONTENT_RES = NON_CONTENT_TAGS.map(
  (tag) => new RegExp(`<${tag}(?=[\\s/>])[\\s\\S]*?<\\/${tag}\\s*>`, "gi"),
);

/**
 * Decode numeric (decimal and hex) and known named character references.
 * Unknown named references are left untouched. Decoding is single-pass, so
 * the output of one reference is never re-interpreted as another
 * ("&#38;lt;" becomes "&lt;", not "<").
 *
 * @param {string} s - Text containing character references.
 * @returns {string} The decoded text.
 */
function decodeEntities(s) {
  return s.replace(ENTITY_RE, (match, hex, dec, name) => {
    if (hex !== undefined) return safeChar(Number.parseInt(hex, 16));
    if (dec !== undefined) return safeChar(Number.parseInt(dec, 10));
    return ENTITIES[name.toLowerCase()] ?? match;
  });
}

/**
 * Convert a code point to a string, yielding "" for values that are not
 * valid Unicode code points (NaN, negative, or above U+10FFFF).
 *
 * @param {number} code - Candidate code point.
 * @returns {string} The character, or "" if the code point is invalid.
 */
function safeChar(code) {
  const isValid = Number.isInteger(code) && code >= 0 && code <= 0x10ffff;
  return isValid ? String.fromCodePoint(code) : "";
}

/**
 * Extract the contents of the first <title> element.
 *
 * @param {string} html - Raw HTML; null/undefined is treated as empty.
 * @returns {string} Entity-decoded, whitespace-collapsed title capped at
 *   200 UTF-16 code units, or "" when there is no title.
 */
export function extractTitle(html) {
  const source = html == null ? "" : String(html);
  const m = source.match(/<title(?:\s[^>]*)?>([\s\S]*?)<\/title\s*>/i);
  if (!m) return "";
  return decodeEntities(m[1]).replace(/\s+/g, " ").trim().slice(0, 200);
}

/**
 * Convert an HTML document to plain text with light structure:
 * headings become "## " lines, list items "- " lines, table cells are
 * separated by " | ", and block boundaries become newlines.
 *
 * @param {string} html - Raw HTML; null/undefined is treated as empty.
 * @returns {string} Cleaned text with at most one blank line in a row.
 */
export function htmlToText(html) {
  let s = html == null ? "" : String(html);

  // Remove comments and non-content blocks entirely.
  s = s.replace(/<!--[\s\S]*?-->/g, " ");
  for (const re of NON_CONTENT_RES) {
    s = s.replace(re, " ");
  }

  // Preserve structure: headings, paragraphs, list items, table cells, breaks.
  // The (?=[\s/>]) lookaheads keep <li> from matching <link> and <th> from
  // matching <thead>.
  s = s.replace(/<\/(h[1-6])>/gi, "\n\n");
  s = s.replace(/<(h[1-6])[^>]*>/gi, "\n\n## ");
  s = s.replace(/<\/(p|div|section|article|blockquote|tr|table|ul|ol|figcaption)>/gi, "\n");
  s = s.replace(/<li(?=[\s/>])[^>]*>/gi, "\n- ");
  s = s.replace(/<(td|th)(?=[\s/>])[^>]*>/gi, " | ");
  s = s.replace(/<br(?=[\s/>])[^>]*>/gi, "\n");

  // Strip all remaining tags, then decode entities. Decoding happens last so
  // an escaped "&lt;p&gt;" is not mistaken for markup.
  s = s.replace(/<[^>]+>/g, " ");
  s = decodeEntities(s);

  // Normalize whitespace.
  s = s.replace(/\r/g, "");
  s = s.replace(/[ \t]+/g, " ");
  s = s.replace(/ ?\n ?/g, "\n");
  s = s.replace(/\n{3,}/g, "\n\n");

  // Drop very short junk lines (menus, single links). Blank lines are kept as
  // paragraph separators, as are short lines starting with "-", "#" or a digit.
  const kept = s
    .split("\n")
    .map((line) => line.trim())
    .filter((line) => line === "" || line.length >= 3 || /^[-#\d]/.test(line));

  return kept.join("\n").replace(/\n{3,}/g, "\n\n").trim();
}