// lib/html-to-text.js — turn a fetched web page into clean, LLM-friendly text.
// Zero dependencies: pragmatic tag stripping + entity decoding.
// Named character references understood by decodeEntities, keyed by the
// lower-case entity name without the surrounding "&" and ";".
// `__proto__: null` gives the table no prototype, so looking up an arbitrary
// name taken from the page (e.g. "constructor") yields undefined instead of an
// inherited Object.prototype member. The table is frozen because it is shared,
// read-only data.
const ENTITIES = Object.freeze({
  __proto__: null,
  // Markup-significant characters and spacing
  amp: "&", lt: "<", gt: ">", quot: '"', apos: "'", nbsp: " ",
  // Typographic punctuation (quotes are normalised to their ASCII forms)
  mdash: "—", ndash: "–", hellip: "…", rsquo: "'", lsquo: "'",
  rdquo: '"', ldquo: '"', copy: "©", reg: "®", trade: "™",
  // Symbols and fractions
  deg: "°", frac12: "½", frac14: "¼", times: "×", divide: "÷",
  // Common accented Latin letters
  eacute: "é", egrave: "è", agrave: "à", ccedil: "ç", uuml: "ü", ouml: "ö", auml: "ä",
});
/**
 * Decode HTML character references in a string.
 *
 * Handles hexadecimal (`&#x41;`), decimal (`&#65;`) and the named references
 * listed in ENTITIES. Names are matched case-insensitively and folded to
 * lower case before lookup. Unknown named references are left untouched.
 *
 * Everything is decoded in a single pass so that the output of one
 * replacement is never re-interpreted as another reference
 * (`&#38;lt;` decodes to the text `&lt;`, not to `<`).
 *
 * @param {string} s - Text that may contain character references.
 * @returns {string} The text with recognised references replaced.
 */
function decodeEntities(s) {
  // Group 1: hex digits, group 2: decimal digits, group 3: entity name.
  // Names may contain digits after the first letter (frac12, frac14).
  const reference = /&(?:#x([0-9a-f]+)|#(\d+)|([a-z][a-z0-9]*));/gi;
  return s.replace(reference, (match, hex, dec, name) => {
    if (hex !== undefined) return safeChar(parseInt(hex, 16));
    if (dec !== undefined) return safeChar(parseInt(dec, 10));
    const key = name.toLowerCase();
    // Own-property check: names such as "constructor" must not resolve to
    // members inherited from Object.prototype.
    return Object.prototype.hasOwnProperty.call(ENTITIES, key) ? ENTITIES[key] : match;
  });
}
/**
 * Convert a numeric code point to a one-character string, or "" when the
 * value cannot be represented as well-formed text.
 *
 * Rejected values: anything that is not an integer (including NaN), values
 * outside 1..0x10FFFF, and the surrogate range 0xD800..0xDFFF. Surrogates are
 * excluded because String.fromCodePoint would accept them and return a lone
 * surrogate, i.e. an ill-formed string; NUL is excluded because it is never
 * meaningful in extracted text.
 *
 * @param {number} code - Candidate Unicode code point.
 * @returns {string} The character, or "" if `code` is not usable.
 */
function safeChar(code) {
  if (!Number.isInteger(code)) return "";
  if (code <= 0 || code > 0x10ffff) return "";
  if (code >= 0xd800 && code <= 0xdfff) return "";
  return String.fromCodePoint(code);
}
/**
 * Extract the text of the first `<title>` element of an HTML document.
 *
 * The title is entity-decoded, whitespace runs are collapsed to single
 * spaces, and the result is trimmed and capped at 200 UTF-16 code units.
 *
 * @param {string} html - Raw HTML; other values are coerced to a string,
 *   with null/undefined treated as empty.
 * @returns {string} The cleaned title, or "" when no `<title>` is found.
 */
export function extractTitle(html) {
  // `\b` keeps the pattern from matching elements that merely start with
  // "title"; the body is captured lazily up to the first closing tag.
  const match = String(html ?? "").match(/<title\b[^>]*>([\s\S]*?)<\/title>/i);
  if (!match) return "";
  return decodeEntities(match[1]).replace(/\s+/g, " ").trim().slice(0, 200);
}
// One pattern per element whose entire subtree is discarded (scripts, styling,
// navigation chrome, form controls, ...). Built once at module load.
// The lookahead after the tag name requires whitespace, "/" or ">" so that
// e.g. "nav" does not also match "<navigation>".
const NON_CONTENT_PATTERNS = [
  "script", "style", "noscript", "svg", "iframe", "form", "nav", "footer",
  "header", "aside", "template", "button", "select",
].map((tag) => new RegExp(`<${tag}(?=[\\s/>])[\\s\\S]*?<\\/${tag}\\s*>`, "gi"));

/**
 * Convert an HTML document into plain text with light Markdown-like structure.
 *
 * Steps: drop comments and non-content elements, turn headings into "## "
 * lines, list items into "- " lines, table cells into " | "-separated text
 * and `<br>` into newlines, strip every remaining tag, decode entities, then
 * normalise whitespace and discard very short leftover lines.
 *
 * This is regex-based tag stripping, not an HTML parser; malformed or
 * unusual markup is handled on a best-effort basis.
 *
 * @param {string} html - Raw HTML; other values are coerced to a string,
 *   with null/undefined treated as empty.
 * @returns {string} The extracted text, with at most one blank line between
 *   blocks and no leading or trailing whitespace.
 */
export function htmlToText(html) {
  let s = String(html ?? "");

  // Remove non-content blocks entirely. Tags are processed one at a time, in
  // list order, so scripts and styles are gone before container elements
  // such as <nav> are matched.
  s = s.replace(/<!--[\s\S]*?-->/g, " ");
  for (const pattern of NON_CONTENT_PATTERNS) {
    s = s.replace(pattern, " ");
  }

  // Preserve structure: headings, paragraphs, list items, table cells, breaks.
  // `\b` after each tag name prevents prefix matches (<li> vs <link>,
  // <th> vs <thead>).
  s = s.replace(/<\/(h[1-6])>/gi, "\n\n");
  s = s.replace(/<h[1-6]\b[^>]*>/gi, "\n\n## ");
  s = s.replace(/<\/(p|div|section|article|blockquote|tr|table|ul|ol|figcaption)>/gi, "\n");
  s = s.replace(/<li\b[^>]*>/gi, "\n- ");
  s = s.replace(/<(td|th)\b[^>]*>/gi, " | ");
  s = s.replace(/<br\b[^>]*>/gi, "\n");

  // Strip all remaining tags, then decode entities. Decoding last means an
  // escaped "&lt;b&gt;" survives as literal text instead of being stripped.
  s = s.replace(/<[^>]+>/g, " ");
  s = decodeEntities(s);

  // Normalize whitespace: drop carriage returns and collapse runs of spaces
  // and tabs. Spaces around newlines are removed by the per-line trim below.
  s = s.replace(/\r/g, "");
  s = s.replace(/[ \t]+/g, " ");

  // Drop very short junk lines (stray separators, one- or two-character
  // fragments). This applies to every document regardless of size. Blank
  // lines are kept as paragraph separators, and short lines that start with
  // "-", "#" or a digit are kept because they may be list markers, headings
  // or numbers.
  const kept = s
    .split("\n")
    .map((line) => line.trim())
    .filter((line) => line === "" || line.length >= 3 || /^[-#\d]/.test(line));

  // Collapse runs of blank lines to a single blank line.
  return kept.join("\n").replace(/\n{3,}/g, "\n\n").trim();
}