Self-contained Dockerized build for end users. Run via docker compose; see README.md for setup. Source-only, no sample data or build artifacts. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
68 lines
2.4 KiB
JavaScript
68 lines
2.4 KiB
JavaScript
// lib/html-to-text.js — turn a fetched web page into clean, LLM-friendly text.
|
||
// Zero dependencies: pragmatic tag stripping + entity decoding.
|
||
|
||
/**
 * Named character references this module understands, keyed by lower-case
 * entity name (without the surrounding "&" and ";").
 *
 * Stored on a null-prototype object so that inherited members such as
 * `constructor` can never be mistaken for an entity during lookup.
 */
const ENTITIES = Object.assign(Object.create(null), {
  amp: "&", lt: "<", gt: ">", quot: '"', apos: "'", nbsp: " ",
  mdash: "—", ndash: "–", hellip: "…", rsquo: "'", lsquo: "'",
  rdquo: '"', ldquo: '"', copy: "©", reg: "®", trade: "™",
  deg: "°", frac12: "½", frac14: "¼", times: "×", divide: "÷",
  eacute: "é", egrave: "è", agrave: "à", ccedil: "ç", uuml: "ü", ouml: "ö", auml: "ä",
});

/**
 * Decode HTML character references in a string.
 *
 * Handles hexadecimal (`&#x41;`), decimal (`&#65;`) and the named references
 * listed in ENTITIES. Names are matched case-insensitively by lower-casing
 * them before lookup, so `&Eacute;` yields the same text as `&eacute;`.
 * Unknown named references are left untouched.
 *
 * @param {string} s - Text that may contain character references.
 * @returns {string} The text with recognised references replaced.
 */
function decodeEntities(s) {
  // Everything is decoded in ONE pass. Chaining separate replace() calls would
  // re-scan already-decoded output, turning "&#38;lt;" into "<" instead of the
  // literal text "&lt;". Named references may contain digits (e.g. frac12).
  return s.replace(
    /&(?:#x([0-9a-f]+)|#(\d+)|([a-z][a-z0-9]*));/gi,
    (match, hex, dec, name) => {
      if (hex !== undefined) return safeChar(parseInt(hex, 16));
      if (dec !== undefined) return safeChar(parseInt(dec, 10));
      return ENTITIES[name.toLowerCase()] ?? match;
    },
  );
}

/**
 * Convert a numeric code point to its string form.
 *
 * @param {number} code - Candidate Unicode code point.
 * @returns {string} The character, or "" when `code` is not an integer in
 *   the range 0..0x10FFFF (the inputs String.fromCodePoint would reject).
 */
function safeChar(code) {
  const isValid = Number.isInteger(code) && code >= 0 && code <= 0x10ffff;
  return isValid ? String.fromCodePoint(code) : "";
}
|
||
|
||
/**
 * Extract the contents of the first <title> element from raw HTML.
 *
 * @param {*} html - Page markup. Coerced with String(), so non-string input
 *   (null, undefined, numbers, ...) yields "" instead of throwing.
 * @returns {string} The title with character references decoded and
 *   whitespace runs collapsed to single spaces, limited to 200 UTF-16 code
 *   units; "" when no <title> element is found.
 */
export function extractTitle(html) {
  const MAX_TITLE_LENGTH = 200;

  const found = /<title[^>]*>([\s\S]*?)<\/title>/i.exec(String(html));
  if (!found) return "";

  const title = decodeEntities(found[1]).replace(/\s+/g, " ").trim();
  if (title.length <= MAX_TITLE_LENGTH) return title;

  let clipped = title.slice(0, MAX_TITLE_LENGTH);
  // If the cut landed between the two halves of a surrogate pair, drop the
  // dangling high surrogate rather than returning malformed text.
  const lastUnit = clipped.charCodeAt(clipped.length - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    clipped = clipped.slice(0, -1);
  }
  return clipped;
}
|
||
|
||
/**
 * Convert an HTML document into plain text with light structural markers.
 *
 * Steps, in order: drop comments and non-content elements, turn structural
 * tags into newlines / "## " headings / "- " bullets / " | " cell
 * separators, strip the remaining tags, decode character references,
 * normalise whitespace, and discard very short lines.
 *
 * @param {*} html - Page markup; coerced with String().
 * @returns {string} The extracted text, trimmed, with at most one blank line
 *   between paragraphs.
 */
export function htmlToText(html) {
  let s = String(html);

  // Remove non-content blocks entirely
  s = s.replace(/<!--[\s\S]*?-->/g, " ");
  for (const tag of ["script", "style", "noscript", "svg", "iframe", "form", "nav", "footer", "header", "aside", "template", "button", "select"]) {
    // The lookahead requires the tag name to END here, so "<form" does not
    // also match "<form-field>" or "<formula>" and delete text up to an
    // unrelated closing tag. Closing tags may carry whitespace ("</nav >").
    s = s.replace(new RegExp(`<${tag}(?=[\\s/>])[\\s\\S]*?<\\/${tag}\\s*>`, "gi"), " ");
  }

  // Preserve structure: headings, paragraphs, list items, table cells, breaks.
  // Opening-tag patterns use the same name-end lookahead so that "<li" does
  // not match "<link ...>" and "<th" does not match "<thead>".
  s = s.replace(/<\/(h[1-6])>/gi, "\n\n");
  s = s.replace(/<h[1-6](?=[\s/>])[^>]*>/gi, "\n\n## ");
  s = s.replace(/<\/(p|div|section|article|blockquote|tr|table|ul|ol|figcaption)>/gi, "\n");
  s = s.replace(/<li(?=[\s/>])[^>]*>/gi, "\n- ");
  s = s.replace(/<t[dh](?=[\s/>])[^>]*>/gi, " | ");
  s = s.replace(/<br(?=[\s/>])[^>]*>/gi, "\n");

  // Strip all remaining tags. Markup only starts with "<" followed by a
  // letter, "/", "!" or "?"; anything else (e.g. "1 < 2 and 3 > 2") is text
  // and is left alone.
  s = s.replace(/<\/?[a-z!?][^>]*>/gi, " ");

  // Decode only after tags are gone so that escaped markup such as
  // "&lt;b&gt;" survives as literal text instead of being stripped.
  s = decodeEntities(s);

  // Normalize whitespace
  s = s.replace(/\r/g, "");
  s = s.replace(/[ \t]+/g, " ");
  s = s.replace(/ ?\n ?/g, "\n");
  s = s.replace(/\n{3,}/g, "\n\n");

  // Drop very short junk lines (stray separators, one- or two-character
  // leftovers). Blank lines are kept as paragraph breaks, and short lines
  // that start with "-", "#" or a digit are kept as likely list items,
  // headings or numbers.
  const kept = s
    .split("\n")
    .map((line) => line.trim())
    .filter((line) => line === "" || line.length >= 3 || /^[-#\d]/.test(line));

  return kept.join("\n").replace(/\n{3,}/g, "\n\n").trim();
}
|